#' Call Peaks Using MACS2
#'
#' Identify peaks using selected cells. Fragments belonging to a subset of cells
#' are extracted and used to identify peaks using MACS2. This function requires
#' "MACS2" and "snaptools" preinstalled and excutable.
#'
#' @param obj A snap object.
#' @param output.prefix Prefix of output file which will be used to generate output file names.
#' @param path.to.snaptools Path to snaptools excutable file.
#' @param path.to.macs Path to macs2 excutable file.
#' @param gsize effective genome size. 'hs' for human, 'mm' for mouse, 'ce' for C. elegans, 'dm' for fruitfly (default: None)
#' @param buffer.size Buffer size for incrementally increasing internal array size to store reads alignment information. In
#' most cases, you don't have to change this parameter. However, if there are very high coverage dataset that each barcode has
#' more than 10000 fragments, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will
#' take longer time to read snap files) [1000].
#' @param macs.options String indicate options you would like passed to macs2. strongly do not recommand to change unless you
#' know what you are doing. the default is '--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR --call-summits'.
#' @param tmp.folder Directory to store temporary files generated by runMACS.
#' @param num.cores Number of cores to use for omputing [1].
#' @param keep.minimal Keep minimal version of output [TRUE].
#' @return Return a data.frame object that contains the peak information
#'
#' @export
runMACS <- function(
obj,
output.prefix,
path.to.snaptools,
path.to.macs,
gsize,
tmp.folder,
buffer.size=500,
macs.options="--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR --call-summits",
num.cores=1,
keep.minimal=TRUE
){
cat("Epoch: checking input parameters ... \n", file = stderr())
if(missing(obj)){
stop("obj is missing")
}else{
if(!is.snap(obj)){
stop("obj is not a snap object");
}
if((x=nrow(obj))==0L){
stop("obj is empty");
}
if((x=length(obj@barcode))==0L){
stop("obj@barcode is empty");
}
if((x=length(obj@file))==0L){
stop("obj@file is empty");
}
}
fileList = as.list(unique(obj@file));
# check if snap files exist
if(any(do.call(c, lapply(fileList, function(x){file.exists(x)})) == FALSE)){
idx = which(do.call(c, lapply(fileList, function(x){file.exists(x)})) == FALSE)
print("error: these files does not exist")
print(fileList[idx])
stop()
}
# check if files are all snap files
if(any(do.call(c, lapply(fileList, function(x){isSnapFile(x)})) == FALSE)){
idx = which(do.call(c, lapply(fileList, function(x){isSnapFile(x)})) == FALSE)
print("error: these files are not snap file")
print(fileList[idx])
stop()
}
# check if FM session exist
if(any(do.call(c, lapply(fileList, function(x){ "FM" %in% h5ls(x, recursive=1)$name })) == FALSE)){
idx = which(do.call(c, lapply(fileList, function(x){ "FM" %in% h5ls(x, recursive=1)$name })) == FALSE)
print("error: the following nsap files do not contain FM session")
print(fileList[idx])
stop()
}
if(missing(output.prefix)){
stop("output.prefix is missing");
}
if(missing(path.to.snaptools)){
stop("path.to.snaptools is missing");
}else{
if(!file.exists(path.to.snaptools)){
stop("path.to.snaptools does not exist");
}
flag = tryCatch({
file_test('-x', path.to.snaptools);
},
error=function(cond){
return(FALSE)
})
if(flag == FALSE){
stop("path.to.snaptools is not an excutable file");
}
}
if(missing(path.to.macs)){
stop("path.to.macs is missing");
}else{
if(!file.exists(path.to.macs)){
stop("path.to.macs does not exist");
}
flag = tryCatch({
file_test('-x', path.to.macs);
},
error=function(cond){
return(FALSE)
})
if(flag == FALSE){
stop("path.to.macs is not an excutable file");
}
}
if(missing(gsize)){
stop("gsize is missing");
}
if(missing(tmp.folder)){
stop("tmp.folder is missing")
}else{
if(!dir.exists(tmp.folder)){
stop("tmp.folder does not exist");
}
}
# write the following barcodes down
barcode.files = lapply(fileList, function(file){
tempfile(tmpdir = tmp.folder, fileext = ".barcode.txt");
})
bed.files = lapply(fileList, function(file){
tempfile(tmpdir = tmp.folder, fileext = ".bed.gz");
})
# write down the barcodes
cat("Epoch: extracting fragments from each snap files ...\n", file = stderr())
flag.list = lapply(seq(fileList), function(i){
file.name = fileList[[i]];
idx = which(obj@file == file.name);
barcode.use = obj@barcode[idx]
write.table(barcode.use, file = barcode.files[[i]], append = FALSE, quote = FALSE, sep = "\t",
eol = "\n", na = "NA", dec = ".", row.names = FALSE,
col.names = FALSE, qmethod = c("escape", "double"),
fileEncoding = "")
})
# extract the fragments belong to the barcodes
flag.list = mclapply(seq(fileList), function(i){
flag = system2(command=path.to.snaptools,
args=c("dump-fragment",
"--snap-file", fileList[[i]],
"--output-file", bed.files[[i]],
"--barcode-file", barcode.files[[i]],
"--buffer-size", buffer.size
)
)
}, mc.cores=num.cores);
# combine these two bed files
combined.bed = tempfile(tmpdir = tmp.folder, fileext = ".bed.gz");
flag = system2(command="cat",
args=c(paste(bed.files, collapse = ' '),
">", combined.bed
)
)
# call peaks using MACS2
flag = system2(command=path.to.macs,
args=c("callpeak",
"-t", combined.bed,
"-f", "BED",
"-g", gsize,
macs.options,
"-n", output.prefix
)
)
if (flag != 0) {
stop("'MACS' call failed");
}
if(keep.minimal){
system(paste("rm ", output.prefix, "_control_lambda.bdg", sep=""));
system(paste("rm ", output.prefix, "_peaks.xls", sep=""));
system(paste("rm ", output.prefix, "_summits.bed", sep=""));
}
return(read.table(paste(output.prefix, "_peaks.narrowPeak", sep="")));
}
#' Call Peaks Using MACS2 For All Clusters
#'
#' Identify peaks for all clusters. Fragments belonging to each subset or cluster of cells
#' are extracted and used to identify peaks using MACS2. This function requires
#' "MACS2" and "snaptools" preinstalled and excutable.
#'
#' @param obj A snap object.
#' @param output.prefix Prefix of output file which will be used to generate output file names.
#' @param path.to.snaptools Path to snaptools excutable file.
#' @param path.to.macs Path to macs2 excutable file.
#' @param min.cells min number of cells to perform peak calling [100]. Clusters with cells less
#' than num.cells will be excluded.
#' @param num.cores number of cpus to use [1].
#' @param gsize effective genome size. 'hs' for human, 'mm' for mouse, 'ce' for C. elegans, 'dm' for fruitfly (default: None)
#' @param buffer.size Buffer size for incrementally increasing internal array size to store reads alignment information. In
#' most cases, you don't have to change this parameter. However, if there are very high coverage dataset that each barcode has
#' more than 10000 fragments, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read snap files) [1000].
#' @param macs.options String indicate options you would like passed to macs2. strongly do not recommand to change unless you know what you are doing. the default is '--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR --call-summits'.
#' @param tmp.folder Directory to store temporary files generated by runMACSForAll.
#' @param keep.minimal Keep minimal version of output [TRUE].
#' @return Return a GRanges object that contains the non-overlapping combined peaks
#' @importFrom parallel mclapply
#' @importFrom GenomicRanges GRanges reduce
#' @export
runMACSForAll <- function(
obj,
output.prefix,
path.to.snaptools,
path.to.macs,
gsize,
tmp.folder,
num.cores=1,
min.cells=100,
buffer.size=500,
macs.options="--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR --call-summits",
keep.minimal=TRUE
){
cat("Epoch: checking input parameters ... \n", file = stderr())
if(missing(obj)){
stop("obj is missing")
}else{
if(!is.snap(obj)){
stop("obj is not a snap object");
}
if((x=nrow(obj))==0L){
stop("obj is empty");
}
if((x=length(obj@barcode))==0L){
stop("obj@barcode is empty");
}
if((x=length(obj@file))==0L){
stop("obj@file is empty");
}
nclusters = length(levels(obj@cluster));
}
fileList = as.list(unique(obj@file));
# check if snap files exist
if(any(do.call(c, lapply(fileList, function(x){file.exists(x)})) == FALSE)){
idx = which(do.call(c, lapply(fileList, function(x){file.exists(x)})) == FALSE)
print("error: these files does not exist")
print(fileList[idx])
stop()
}
# check if files are all snap files
if(any(do.call(c, lapply(fileList, function(x){isSnapFile(x)})) == FALSE)){
idx = which(do.call(c, lapply(fileList, function(x){isSnapFile(x)})) == FALSE)
print("error: these files are not snap file")
print(fileList[idx])
stop()
}
# check if FM session exist
if(any(do.call(c, lapply(fileList, function(x){ "FM" %in% h5ls(x, recursive=1)$name })) == FALSE)){
idx = which(do.call(c, lapply(fileList, function(x){ "FM" %in% h5ls(x, recursive=1)$name })) == FALSE)
print("error: the following nsap files do not contain FM session")
print(fileList[idx])
stop()
}
if(nclusters == 0L){
stop("obj does not have cluster, runCluster first")
}
if(missing(output.prefix)){
stop("output.prefix is missing");
}
if(missing(path.to.snaptools)){
stop("path.to.snaptools is missing");
}else{
if(!file.exists(path.to.snaptools)){
stop("path.to.snaptools does not exist");
}
flag = tryCatch({
file_test('-x', path.to.snaptools);
},
error=function(cond){
return(FALSE)
})
if(flag == FALSE){
stop("path.to.snaptools is not an excutable file");
}
}
if(missing(path.to.macs)){
stop("path.to.macs is missing");
}else{
if(!file.exists(path.to.macs)){
stop("path.to.macs does not exist");
}
flag = tryCatch({
file_test('-x', path.to.macs);
},
error=function(cond){
return(FALSE)
})
if(flag == FALSE){
stop("path.to.macs is not an excutable file");
}
}
if(missing(gsize)){
stop("gsize is missing");
}
if(missing(tmp.folder)){
stop("tmp.folder is missing")
}else{
if(!dir.exists(tmp.folder)){
stop("tmp.folder does not exist");
}
}
peak.ls <- parallel::mclapply(as.list(levels(obj@cluster)), function(x){
num.cells = length(which(obj@cluster == x));
if(num.cells < min.cells){
return(GenomicRanges::GRanges())
}
peaks.df <- runMACS(
obj=obj[which(obj@cluster==x),],
output.prefix=paste(output.prefix, x, sep="."),
path.to.snaptools=path.to.snaptools,
path.to.macs=path.to.macs,
gsize=gsize,
buffer.size=buffer.size,
macs.options=macs.options,
tmp.folder=tmp.folder,
keep.minimal=keep.minimal,
num.cores=1
);
if((x=nrow(peaks.df)) > 0L){
peaks.gr = GenomicRanges::GRanges(peaks.df[,1], IRanges(peaks.df[,2], peaks.df[,3]));
}else{
peaks.gr = GenomicRanges::GRanges();
}
}, mc.cores=num.cores)
peaks.gr = GenomicRanges::reduce(do.call(c, peak.ls));
return(peaks.gr);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.