#' Perform unsupervised clustering to capture population structure based on
#' iterative pruning
#'
#' @description This version supports ordinal data which can be applied directly
#' to SNP data to identify fine-scale population structure. It was built on the
#' iterative pruning Principal Component Analysis (ipPCA) algorithm
#' (Intarapanich et al., 2009; Limpiti et al., 2011). The ipcaps involves an
#' iterative process using multiple splits based on multivariate Gaussian
#' mixture modeling of principal components and Clustering EM estimation (Lebret
#' et al., 2015). In each iteration, rough clusters and outliers are also
#' identified using our own method called rubikclust (R package \pkg{KRIS}).
#'
#' @param bed A PLINK binary format consists of 3 files; bed, bim, and fam. To
#' generate these files from PLINK, use option –make-bed. See more details at:
#' \url{http://zzz.bwh.harvard.edu/plink/data.shtml}.
#' @param rdata In case of re-analysis, it is convenient to run ipcaps using the
#' file rawdata.RData generated by ipcaps. This file contains a matrix of SNPs
#' (raw.data) and a vector of labels (label).
#' @param files If the data type is 'snp', ipcaps supports SNPs encoded as 0, 1
#' and 2 (dosage encoding). If the data type is 'linear', the data must be
#' numeric and text and words are not allowed in the input files.
#' Rows represent SNPs or features and columns represent individuals or
#' subjects. Each column needs to be separated by a space or a tab. A big text
#' file should be divided into smaller files to load faster. For instance, to
#' input 3 files, use as: files=c('input1.txt', 'input2.txt', 'input3.txt').
#' @param label.file An additional useful information (called 'labels' in
#' ipcaps) related subject, for example, geographic location or disease
#' phenotype. These labels (one at a time) are used in displaying the clustering
#' outcome of ipcaps. A label file must contain at least one column. However,
#' it may contain more than one column in which case each column need to be
#' separated by a space or a tab.
#' @param lab.col The label in the label file to be used in the tree-like
#' display of ipcaps clustering results.
#' @param out To set an absolute path for ipcaps output. If the specified output
#' directory already exists, result files are saved in sub-directories
#' cluster_out, cluster_out1, cluster_out2, etc.
#' @param plot.as.pdf To export plots as PDF. When omitted, plots are saved as
#' PNG.
#' @param method The internal clustering method. It can be set to 'mix'
#' (rubikclust & mixmod), 'mixmod' (Lebret et al., 2015), 'clara' (R: Clustering
#' Large Applications), 'pam' (R: Partitioning Around Medoids (PAM) Object),
#' 'meanshift' (Wang, 2016), 'apcluster' (Bodenhofer et al., 2016), and 'hclust'
#' (R: Hierarchical Clustering). Default = 'mix'.
#' @param missing Symbol used for missing genotypes. Default = NA.
#' @param covariate A file of covariates; one covariate per column. SNPs can be
#' adjusted for these covariates via regression modeling and residual
#' computation.
#' @param cov.col.first Refer to a covariate file, the first covariate to be
#' considered as confounding variable.
#' @param cov.col.last Refer to a covariate file, the last covariate to be
#' considered as confounding variable. All the variables in between the
#' cov.col.first and cov.col.last will be considered in the adjustment process.
#' @param threshold Cutoff value for EigenFit. Possible values range from 0.03
#' to 0.18. The smaller, the potentially finer the substructure can be. Default
#' = 0.18.
#' @param min.fst Minimum Fst between a pair of subgroups. Default = 0.0008.
#' @param min.in.group Minimum number of individuals to constitute a cluster or
#' subgroup. Default = 20.
#' @param no.plot No plot is generated if this option is TRUE. This option is
#' useful when the system does not support X Windows in the unix based system.
#' Default = FALSE.
#' @param data.type To specify which type of input data between 'snp' and
#' 'linear'. Default = 'snp'.
#' @param silence To enable or disable silence mode. If silence mode is enabled,
#' the fuction is processed without printing any message on the screen, and
#' it is slightly faster. Default = TRUE.
#' @param max.thread To specify a number of threads in order to run an analysis
#' in parallel. If max.thread is specified more than the maximum number of CPU
#' cores, then the maximum number of CPU cores are used instead. If max.thread
#' is specified as floating point number, it will be rounded up using the
#' function round(). Default = 0, which the maximum number of CPU cores are
#' used.
#' @param seed To specify a seed number for the function mixmod in the package
#' Rmixmod. Default = NA, which means that a seed number is automatically chose.
#'
#' @return Returns the list object containing 2 internal objects;
#' output.dir as class character and cluster as class data.frame. The object
#' output.dir stores a result directory. The object cluster contains 4 columns,
#' group, node, label, and row.number. The column group contains the assigned
#' groups from ipcaps. The column node contains node numbers in a tree as
#' illustrated in the HTML result files. The column label contains the given
#' labels that is set to the parameter label. The column row.number contains
#' indices to an input data, which is matched to row numbers of input matrix.
#' All clustering result files are saved in one directory (as specified by out)
#' containing assigned groups, hierarchical trees of group membership, PC plots,
#' and binary data for further analysis.
#' \itemize{
#' \item \code{$snp} is a SNP matrix from BED file.
#' \item \code{$snp.info} is a data.frame of SNP information from BIM file.
#' \item \code{$ind.info} is a data.frame of individual information from FAM
#' file.
#' }
#' If function return \code{NULL}, it means the input files are not in proper
#' format.
#'
#' @details The computational time depends on the number of individuals.
#' Consequentially, if large data sets are analyzed, it may be necessary first
#' to split data into smaller files, and then load into computer memory (with
#' parameter 'files'). Internally, the split files are merged to construct
#' a com-prehensive matrix.
#'
#' @export
#'
#' @import parallel
#' @import foreach
# @import doMC
#' @import doParallel
#' @importFrom snow snow.time
#'
#' @include preprocess.R
#' @include postprocess.R
#' @include process.each.node.R
#'
#' @md
#'
#' @references
#'
#' Bodenhofer, U., Palme, J., Melkonian, C., and Kothmeier, A. (2016). apcluster
#' : Affinity Propagation Clustering. Available at: \url{https://CRAN.R-project.
#' org/package=apcluster} (Accessed March 7, 2017).
#'
#' Intarapanich, A., Shaw, P. J., Assawamakin, A., Wangkumhang, P., Ngamphiw, C.
#' , Chaichoompu, K., et al. (2009). Iterative pruning PCA improves resolution
#' of highly structured populations. BMC Bioinformatics 10, 382. doi:10.1186/
#' 1471-2105-10-382.
#'
#' Lebret, R., Iovleff, S., Langrognet, F., Biernacki, C., Celeux, G., and
#' Govaert, G. (2015). Rmixmod: TheRPackage of the Model-Based Unsupervised,
#' Supervised, and Semi-Supervised ClassificationMixmodLibrary. J. Stat. Softw.
#' 67. doi:10.18637/jss.v067.i06.
#'
#' Limpiti, T., Intarapanich, A., Assawamakin, A., Shaw, P. J., Wangkumhang, P.,
#' Piriyapongsa, J., et al. (2011). Study of large and highly stratified
#' population datasets by combining iterative pruning principal component
#' analysis and structure. BMC Bioinformatics 12, 255. doi:10.1186/1471-2105-12-
#' 255.
#'
#' Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M., and Hornik, K. (2017).
#' cluster: Cluster Analysis Basics and Extensions.
#'
#' R: Clustering Large Applications Available at: \url{https://stat.ethz.ch/
#' R-manual/R-devel/library/cluster/html/clara.html} (Accessed March 7, 2017).
#'
#' R Core Team (2017). R: A Language and Environment for Statistical Computing.
#' Vienna, Austria: R Foundation for Statistical Computing Available at:
#' \url{https://www.R-project.org/}.
#'
#' R: Hierarchical Clustering Available at: \url{https://stat.ethz.ch/R-manual/
#' R-devel/library/stats/html/hclust.html} (Accessed March 7, 2017).
#'
#' R: Partitioning Around Medoids (PAM) Object Available at: \url{https://stat.
#' ethz.ch/R-manual/R-devel/library/cluster/html/pam.object.html} (Accessed
#' March 7, 2017).
#'
#' Wang, M. C. and D. (2016). MeanShift: Clustering via the Mean Shift
#' Algorithm. Available at: \url{https://CRAN.R-project.org/package=MeanShift}
#' (Accessed March 7, 2017).
#'
#' @examples
#'
#' #Use the BED format as input
#' #Importantly, bed file, bim file, and fam file are required
#' #Use the example files embedded in the package
#'
#' BED.file <- system.file('extdata',
#' 'ipcaps_example.bed',
#' package = 'IPCAPS.BIOC')
#' LABEL.file <- system.file('extdata',
#' 'ipcaps_example_individuals.txt.gz',
#' package = 'IPCAPS.BIOC')
#'
#' my.cluster1 <- ipcaps(bed = BED.file,
#' label.file = LABEL.file,
#' lab.col = 2,
#' out = tempdir(),
#' max.thread = 1,
#' seed = 1234)
#'
#' table(my.cluster1$cluster$label, my.cluster1$cluster$group)
#'
#' # Use a text file as input
#' # Use the example files embedded in the package
#'
#' text.file <- system.file('extdata',
#' 'ipcaps_example_rowVar_colInd.txt.gz',
#' package = 'IPCAPS.BIOC')
#'
#' LABEL.file <- system.file('extdata',
#' 'ipcaps_example_individuals.txt.gz',
#' package = 'IPCAPS.BIOC')
#'
#' my.cluster2 <- ipcaps(files = c(text.file),
#' label.file = LABEL.file,
#' lab.col = 2,
#' out=tempdir(),
#' max.thread = 1,
#' seed = 1234)
#'
#' table(my.cluster2$cluster$label, my.cluster2$cluster$group)
#'
#' # Use an R Data file as input
#' # Use the example file embedded in the package
#'
#' rdata.file <- system.file('extdata',
#' 'ipcaps_example.RData',
#' package = 'IPCAPS.BIOC')
#'
#' my.cluster3 <- ipcaps(rdata = rdata.file,
#' out = tempdir(),
#' max.thread = 1,
#' seed = 1234)
#'
#' table(my.cluster3$cluster$label, my.cluster3$cluster$group)
#'
ipcaps <-
function(bed = NA,
rdata = NA,
files = NA,
label.file = NA,
lab.col = 1,
out,
plot.as.pdf = FALSE,
method = "mix",
missing = NA,
covariate = NA,
cov.col.first = NA,
cov.col.last = NA,
threshold = 0.18,
min.fst = 8e-04,
min.in.group = 20,
no.plot = FALSE,
data.type = "snp",
silence = FALSE,
max.thread = 0,
seed = NA)
{
filename.label <- label.file
label.column <- lab.col
result.dir <- out
rerun <- FALSE
rdata.infile <- rdata
bed.infile <- bed
datatype <- data.type
nonlinear <- FALSE
missing.char <- missing
regression.file <- covariate
regression.col.first <- cov.col.first
regression.col.last <- cov.col.last
file.list <- files
max.thread <- max.thread
cate.list <- NA
silence.mode <- silence
seed.num <- seed
start.time <- Sys.time()
if (length(threshold) <= 0)
{
threshold <- 0.18
# work in general. The lowest value is 0.03 for the data without
# outlier
}
if (length(min.fst) <= 0)
{
min.fst <- 8e-04
}
usage <- paste0("Usage: ?ipcaps\n")
if (length(result.dir) == 0)
{
cat(usage)
quit()
}
if ((length(filename.label) == 0 || is.na(file.list)) &&
(length(rdata.infile) == 0) &&
(length(bed.infile) == 0) &&
is.na(cate.list))
{
cat(usage)
quit()
}
if (silence.mode != FALSE)
{
# To protect the error from user's input
silence.mode <- TRUE
}
if (!silence.mode)
cat(paste0("Running ... ipcaps \n\toutput: ", result.dir, " \n"))
if (length(filename.label) > 0)
{
if (!silence.mode)
cat(paste0("\tlabel file: ", filename.label, "\n"))
}
if (length(label.column) > 0)
{
if (is.na(as.integer(label.column)))
{
# except only comma as separator, otherwise
if (length(strsplit(label.column, ",")[[1]]))
{
if (anyNA(as.integer(strsplit(label.column, ",")[[1]])))
{
label.column <- 1
if (!silence.mode)
cat(paste0("\tlabel column: ", label.column, "\n"))
} else
{
label.column <- as.integer(strsplit
(label.column, ",")[[1]])
}
} else
{
label.column <- 1
if (!silence.mode)
cat(paste0("\tlabel column: ", label.column, "\n"))
}
} else
{
label.column <- as.integer(label.column)
if (!silence.mode)
cat(paste0("\tlabel column: ", label.column, "\n"))
}
} else
{
label.column <- 1
}
if (length(threshold) > 0)
{
if (!silence.mode)
cat(paste0("\tthreshold: ", threshold, "\n"))
}
if (length(min.fst) > 0)
{
if (!silence.mode)
cat(paste0("\tminimum Fst: ", min.fst, "\n"))
}
if (length(seed) <= 0)
{
seed.num <- NA
}
if (is.na(seed))
{
seed.num <- NA
} else
{
seed.num <- as.integer(seed)
}
if (length(max.thread) <= 0)
{
max.thread <- detectCores()
}
if (is.na(max.thread))
{
max.thread <- detectCores()
} else
{
max.thread <- round(max.thread)
if (max.thread < 1)
{
max.thread <- detectCores()
} else if (max.thread > detectCores())
{
max.thread <- detectCores()
}
# register for parallel threads
#registerDoMC(max.thread)
threads <- makeCluster(max.thread,type="SOCK")
registerDoParallel(threads)
}
if (!silence.mode)
cat(paste0("\tmaximum thread: ", max.thread, "\n"))
if (length(method) > 0)
{
if (!silence.mode)
cat(paste0("\tmethod: ", method, "\n"))
} else
{
method <- "mix"
}
if (!is.na(rdata.infile))
{
if (file.exists(rdata.infile))
{
if (!silence.mode)
cat(paste0("\trdata: ", rdata.infile, "\n"))
} else
{
rdata.infile <- NA
}
} else
{
rdata.infile <- NA
}
if (!is.na(bed.infile))
{
if (file.exists(bed.infile))
{
if (!silence.mode)
cat(paste0("\tbed: ", bed.infile, "\n"))
} else
{
bed.infile <- NA
}
} else
{
bed.infile <- NA
}
if (!is.na(file.list))
{
if (!silence.mode)
cat(paste0("\tfiles: \n"))
print(file.list)
}
if (plot.as.pdf)
{
if (!silence.mode)
cat(paste0("\tplot.as.pdf: TRUE\n"))
plot.as.pdf <- TRUE
} else
{
plot.as.pdf <- FALSE
}
if (length(min.in.group) > 0)
{
if (min.in.group < 5)
{
min.in.group <- 5
}
if (!silence.mode)
cat(paste0("\tminimum in group: ", min.in.group, "\n"))
} else
{
min.in.group <- 20
}
if (length(datatype) > 0)
{
if (datatype %in% c("snp", "linear"))
{
if (!silence.mode)
cat(paste0("\tdata type: ", datatype, "\n"))
} else
{
if (!silence.mode)
cat(paste0("\tInvalid data type (", datatype, "), "))
datatype <- "linear"
if (!silence.mode)
cat(paste0("assume the data type to be ", datatype, "\n"))
}
} else
{
datatype <- "snp"
}
if (length(missing.char) > 0)
{
if (!silence.mode)
cat(paste0("\tmissing: ", missing.char, "\n"))
} else
{
missing.char <- NA
}
if (length(regression.file) > 0 && !is.na(regression.file))
{
if (!silence.mode)
cat(paste0("\tcovariate file: ", regression.file, "\n"))
} else
{
regression.file <- NA
}
if (!is.na(regression.col.first) && regression.col.first > 0)
{
if (!silence.mode)
cat(paste0(
"\tcovariate first column: ",
regression.col.first,
"\n"
))
} else
{
regression.col.first <- NA
}
if (!is.na(regression.col.last) && regression.col.last > 0)
{
if (!silence.mode)
cat(paste0("\tcovariate last column: ", regression.col.last,
"\n"))
} else
{
regression.col.last <- NA
}
if (silence.mode == FALSE)
{
cat(paste0("\tsilence: FALSE\n"))
}
# preprocessing step
if (!silence.mode)
cat(paste0("In preprocessing step\n"))
result.dir <-
preprocess(
files = file.list,
label.file = filename.label,
lab.col = label.column,
rdata.infile = rdata.infile,
bed.infile = bed.infile,
cate.list = cate.list,
result.dir = result.dir,
threshold = threshold,
min.fst = min.fst,
reanalysis = rerun,
method = method,
min.in.group = min.in.group,
datatype = datatype,
nonlinear = nonlinear,
missing.char = missing.char,
regression.file = regression.file,
regression.col.first = regression.col.first,
regression.col.last = regression.col.last,
plot.as.pdf = plot.as.pdf,
no.plot = no.plot,
silence.mode = silence.mode,
max.thread = max.thread,
seed = seed.num
)
if (is.null(result.dir))
{
return(NULL)
}
# job scheduler
if (!silence.mode)
cat(paste0("Start calculating\n"))
# create the first task for Node 1 status 0 = unprocessed, 1 =
# processing, 2 = done , -1 = deleted
node <- c(1)
parent.node <- c(0)
status <- c(0)
tree <- data.frame(node, parent.node, status)
file.name <- file.path(result.dir, "RData", "tree.RData")
save(tree, file = file.name, compress = "bzip2")
# use global.lock to control threads, not to update the same file at
# the same time
myenv <- new.env()
parent.env(myenv)
myenv$global.lock <- FALSE
while (TRUE)
{
file.name <- file.path(result.dir, "RData", "tree.RData")
if (isTRUE(myenv$global.lock))
{
Sys.sleep(runif(1, min = 0, max = 2))
} else
{
myenv$global.lock <- TRUE
load(file = file.name)
myenv$global.lock <- FALSE
}
# check for terminate loop
row2 <- which(tree$status == 2)
row_1 <- which(tree$status == -1)
if ((length(row2) + length(row_1)) == length(tree$node))
{
break
}
file.name <- file.path(result.dir, "RData", "condition.RData")
load(file = file.name)
# take one node to process
which.row <- which(tree$status == 0)
if (length(which.row) > 0)
{
exe.node.list <- tree$node[which.row]
# serial loop
# for (idx in exe.node.list) {
# process.each.node(node =
# idx, work.dir = result.dir)
# parallel version
foreach(thread_idx = seq(1, length(exe.node.list))) %dopar%
{
process.each.node(node = exe.node.list[thread_idx],
work.dir = result.dir)
}
} else
{
break
}
}
if (!silence.mode)
cat(paste0("In post process step\n"))
cluster.tab <- postprocess(result.dir = result.dir)
end.time <- Sys.time()
run.time <- as.numeric(end.time - start.time, units = "secs")
if (!silence.mode)
cat(paste0("Total runtime is ", run.time, " sec\n"))
file.name <- file.path(result.dir, "RData", "runtime.RData")
save(run.time, file = file.name, compress = "bzip2")
cluster.obj <-
list(output.dir = result.dir, cluster = cluster.tab)
file.name <- file.path(result.dir, "RData", "result.RData")
save(cluster.obj, file = file.name, compress = "bzip2")
# To stop all threads
stopCluster(threads)
return(cluster.obj)
}
# Check the result files in your output directory groups.txt contains
# the assigned groups of samples tree_text.html is the text result
# shown as binary tree tree_scatter_cluster.html is the scatter plots
# colored by clustering result tree_scatter_label.html is the scatter
# plots colored by labels tree_scree.html is the scree plots of eigen
# values
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.