#' Extract and summarize methylation or coverage info by regions of interest
#' @details Takes \code{\link{methrix}} object and summarizes regions
#' @param m \code{\link{methrix}} object
#' @param regions genomic regions to be summarized. Could be a data.table with 3 columns (chr, start, end) or a \code{GenomicRanges} object
#' @param type matrix which needs to be summarized. Coule be `M`, `C`. Default 'M'
#' @param how mathematical function by which regions should be summarized. Can be one of the following: mean, sum, max, min. Default 'mean'
#' @param overlap_type defines the type of the overlap of the CpG sites with the target region. Default value is `within`. For detailed description,
#' see the \code{findOverlaps} function of the \code{\link{IRanges}} package.
#' @param na_rm Remove NA's? Default \code{TRUE}
#' @param elementMetadata.col columns in \code{rowData(\link{methrix})} which needs to be summarised. Default = NULL.
#' @param n_chunks Number of chunks to split the \code{\link{methrix}} object in case it is very large. Default = 1.
#' @param n_cores Number of parallel instances. \code{n_cores} should be less than or equal to \code{n_chunks}. If \code{n_chunks} is not specified, then \code{n_chunks} is initialized to be equal to \code{n_cores}. Default = 1.
#' @param verbose Default TRUE
#' @return a coverage or methylation matrix
#' @examples
#' data('methrix_data')
#' get_region_summary(m = methrix_data,
#' regions = data.table(chr = 'chr21', start = 27867971, end = 27868103),
#' type = 'M', how = 'mean')
#' @export
get_region_summary = function(m, regions = NULL, type = "M", how = "mean", overlap_type = "within",
na_rm = TRUE, elementMetadata.col = NULL, verbose = TRUE, n_chunks=1, n_cores=1){
yid <- NULL
if (n_cores > n_chunks){
n_chunks <- n_cores
message("n_cores should be set to be less than or equal to n_chunks.","\n","n_chunks has been set to be equal to n_cores = ", n_cores)
}
type = match.arg(arg = type, choices = c('M', 'C'))
how = match.arg(arg = how, choices = c('mean', 'median', 'max', 'min', 'sum', 'sd'))
start_proc_time = proc.time()
if("data.table" %in% class(regions)){
regions <- GenomicRanges::makeGRangesFromDataFrame(df = regions)
}
target_regions = (regions)
#Add a unique id for every target range (i.e, rows)
mcols(target_regions)$rid <- paste0("rid_", 1:length(target_regions))
r_dat = as.data.frame(rowData(x = m))
r_dat$seqnames <- as.character(r_dat$chr)
r_dat$chr <- NULL
if(is.null(r_dat$end)){
r_dat$end <- r_dat$start + 1
}
r_dat <- GenomicRanges::makeGRangesFromDataFrame(r_dat, keep.extra.columns = FALSE)
if(!all(elementMetadata.col %in% colnames(rowData(m)))) stop("variables provided to elementMetadata.col not correct")
if(verbose){
message("-Checking for overlaps..\n")
}
overlap_indices = as.data.table(findOverlaps(r_dat, target_regions, type = overlap_type))
colnames(overlap_indices) <- c("xid", "yid")
if(nrow(overlap_indices) == 0){
stop("No overlaps detected")
}
overlap_indices[,yid := paste0("rid_", yid)]
n_overlap_cpgs = overlap_indices[, .N, yid]
colnames(n_overlap_cpgs) = c('rid', 'n_overlap_CpGs')
#overlap_indices = split(overlap_indices, as.factor(as.character(overlap_indices$yid)))
if(n_chunks==1){
if (type == "M") {
dat = get_matrix(m = m[overlap_indices$xid,], type = "M", add_loci = TRUE)
} else if (type == "C") {
dat = get_matrix(m = m[overlap_indices$xid,], type = "C", add_loci = TRUE)
}
} else {
#To handle the situation where the number of overlapping sites are lower than the n_chunks
if(nrow(overlap_indices) < n_chunks){
n_chunks <- nrow(overlap_indices)
if (n_cores > n_chunks){
n_cores <- n_chunks
#message("n_cores should be set to be less than or equal to n_chunks.","\n","n_chunks has been set to be equal to n_cores = ", n_cores)
}
}
if (type == "M") {
dat = tryCatch(do.call("rbind",mclapply(mc.cores=n_cores,
split(overlap_indices$xid, ceiling(seq_along(overlap_indices$xid)/ceiling(length(overlap_indices$xid)/n_chunks))),
function(i) {
get_matrix(m[i,], type = "M", add_loci = TRUE)
})),
error = function(e){
message( e, "\n Try to use less cores. ")
})
} else if (type == "C") {
dat = tryCatch(do.call("rbind",mclapply(mc.cores=n_cores, split(overlap_indices$xid, ceiling(seq_along(overlap_indices$xid)/ceiling(length(overlap_indices$xid)/n_chunks))), function(i) {
get_matrix(m[i,], type = "C", add_loci = TRUE)
})),
error = function(e){
message( e, "\n Try to use less cores. ")
})
}
}
if(nrow(overlap_indices) != nrow(dat)){
stop("Something went wrong")
}
dat = cbind(overlap_indices, dat)
#message("-Summarizing overlaps..\n")
if(how == "mean") {
message("-Summarizing by average\n")
output = dat[, lapply(.SD, mean, na.rm = na_rm), by = yid, .SDcols = c(elementMetadata.col,rownames(colData(m)))]
} else if (how == "median") {
message("-Summarizing by median\n")
output = dat[, lapply(.SD, median, na.rm = na_rm), by = yid, .SDcols = c(elementMetadata.col,rownames(colData(m)))]
} else if (how == "max") {
message("-Summarizing by maximum\n")
output = dat[, lapply(.SD, max, na.rm = na_rm), by = yid, .SDcols = c(elementMetadata.col,rownames(colData(m)))]
} else if (how == "min") {
message("-Summarizing by minimum\n")
output = dat[, lapply(.SD, min, na.rm = na_rm), by = yid, .SDcols = c(elementMetadata.col,rownames(colData(m)))]
} else if (how == "sum") {
message("-Summarizing by sum\n")
output = dat[, lapply(.SD, sum, na.rm = na_rm), by = yid, .SDcols = c(elementMetadata.col,rownames(colData(m)))]
}
output = merge(target_regions, output, by.x = 'rid', by.y = 'yid', all.x = TRUE)
output = merge(n_overlap_cpgs, output, by = 'rid')
output$rid <- as.numeric(gsub("rid_","",output$rid))
output <- output[order(output$rid),]
setnames(output, "seqnames", "chr")
keep <- c("chr", "start", "end", "n_overlap_CpGs", "rid", elementMetadata.col, colnames(m))
output <- output[, ..keep]
if(verbose){
message("-Done! Finished in:",data.table::timetaken(start_proc_time),"\n")
}
return(output)
}
#--------------------------------------------------------------------------------------------------------------------------
#' Order mathrix object by SD
#' @details Takes \code{\link{methrix}} object and reorganizes the data by standard deviation
#' @param m \code{\link{methrix}} object
#' @return An object of class \code{\link{methrix}}
#' @examples
#' data('methrix_data')
#' order_by_sd(m = methrix_data)
#' @export
order_by_sd <- function(m) {
if (!is(m, "methrix")){
stop("A valid methrix object needs to be supplied.")
}
if (is_h5(m)) {
sds <- DelayedMatrixStats::rowSds(x = get_matrix(m, type = "M"))
} else {
sds <- matrixStats::rowSds(x = get_matrix(m, type = "M"))
}
row_order <- order(sds, na.last = TRUE, decreasing = TRUE)
m <- m[row_order, ]
m
}
#--------------------------------------------------------------------------------------------------------------------------
#' Subsets \code{\link{methrix}} object based on given conditions.
#' @details Takes \code{\link{methrix}} object and filters CpGs based on coverage statistics
#' @param m \code{\link{methrix}} object
#' @param regions genomic regions to subset by. Could be a data.table with 3 columns (chr, start, end) or a \code{GenomicRanges} object
#' @param contigs chromosome names to subset by
#' @param samples sample names to subset by
#' @param overlap_type defines the type of the overlap of the CpG sites with the target region. Default value is `within`. For detailed description,
#' see the \code{foverlaps} function of the \code{\link{data.table}} package.
#' @examples
#' data('methrix_data')
#' #Subset to chromosome 1
#' subset_methrix(methrix_data, contigs = 'chr21')
#' @return An object of class \code{\link{methrix}}
#' @export
subset_methrix <- function(m, regions = NULL, contigs = NULL, samples = NULL, overlap_type="within") {
if (!is(m, "methrix")){
stop("A valid methrix object needs to be supplied.")
}
r_dat <- data.table::as.data.table(rowData(m))
if (!is.null(regions)) {
message("-Subsetting by genomic regions")
target_regions <- cast_ranges(regions)
r_dat[, `:=`(end, start + 1)]
#data.table::setDT(x = r_dat, key = c("chr", "start", "end"))
overlaps <- data.table::foverlaps(x = r_dat, y = target_regions,
type = overlap_type, nomatch = NULL, which = TRUE)
if (nrow(overlaps) == 0) {
stop("Subsetting resulted in zero entries")
}
m <- m[overlaps$xid, ]
}
if (!is.null(contigs)) {
message("-Subsetting by contigs")
selected_rows <- which(r_dat$chr %in% contigs)
if (length(selected_rows) == 0) {
stop("Subsetting resulted in zero entries")
}
m <- m[selected_rows, ]
}
if (!is.null(samples)) {
message("Subsetting by samples")
samples <- which(rownames(colData(m)) %in% samples)
if (length(samples) == 0) {
stop("None of the samples are present in the object")
}
m <- m[, samples]
}
return(m)
}
#--------------------------------------------------------------------------------------------------------------------------
#' Filter matrices by coverage
#' @details Takes \code{\link{methrix}} object and filters CpGs based on coverage statistics
#' @param m \code{\link{methrix}} object
#' @param cov_thr minimum coverage required to call a loci covered
#' @param min_samples Minimum number of samples that should have a loci with coverage >= \code{cov_thr}. If \code{group} is given, then this applies per group. Only need one of \code{prop_samples} or \code{min_samples}.
#' @param prop_samples Minimum proportion of samples that should have a loci with coverage >= \code{cov_thr}. If \code{group} is given, then this applies per group. Only need one of \code{prop_samples} or \code{min_samples}.
#' @param group a column name from sample annotation that defines groups. In this case, the number of min_samples will be
#' tested group-wise.
#' @param n_chunks Number of chunks to split the \code{\link{methrix}} object in case it is very large. Default = 1.
#' @param n_cores Number of parallel instances. \code{n_cores} should be less than or equal to \code{n_chunks}. If \code{n_chunks} is not specified, then \code{n_chunks} is initialized to be equal to \code{n_cores}. Default = 1.
#' @importFrom methods is as new
#' @examples
#' data('methrix_data')
#' #keep only CpGs which are covered by at-least 1 read across 3 samples
#' coverage_filter(m = methrix_data, cov_thr = 1, min_samples = 3)
#' @return An object of class \code{\link{methrix}}
#' @export
coverage_filter <- function(m, cov_thr = 1, min_samples = 1, prop_samples=0, group = NULL, n_chunks=1, n_cores=1) {
start_proc_time <- proc.time()
V1 <- . <- col2 <- Count2 <- i.to <- NULL
if (!is(m, "methrix")){
stop("A valid methrix object needs to be supplied.")
}
if (!is.numeric(cov_thr)){
stop("cov_thr is not numeric.")
}
if (!(is.numeric(min_samples) & is.numeric(prop_samples))){
stop("min_samples and prop_samples variables are not numeric.")
}
if (!is.null(group) && !(group %in% colnames(colData(m)))){
stop(paste("The column name ", group, " can't be found in colData. Please provid a valid group column."))
}
if (n_cores > n_chunks){
n_chunks <- n_cores
message("n_cores should be set to be less than or equal to n_chunks.", "\n", "n_chunks has been set to be equal to n_cores = ", n_cores)
}
if (is_h5(m)) {
if (n_chunks == 1) {
cov_dat = get_matrix(m = m, type = "C")
if (!is.null(group)) {
row_idx <- sapply(unique(colData(m)[, group]), function(c) {
res <- DelayedMatrixStats::rowSums2(cov_dat[, colData(m)[,
group] == c] >= cov_thr, na.rm = TRUE)
row_idx <- (res >= max(min_samples, ceiling(prop_samples *
sum(colData(m)[, group] == c))))
})
row_idx <- DelayedMatrixStats::rowAlls(row_idx)
} else {
res <- DelayedMatrixStats::rowSums2(cov_dat >= cov_thr, na.rm = TRUE)
row_idx <- (res >= max(min_samples, ceiling(prop_samples *
ncol(cov_dat))))
}
} else {
if (!is.null(group)) {
row_idx <- unlist(mclapply(mc.cores = n_cores, 1:n_chunks,
function(i) {
cov_dat = get_matrix(m[((i - 1) * ceiling(nrow(m)/n_chunks) + 1):min(i * ceiling(nrow(m)/n_chunks), nrow(m)), ],
type = "C")
row_idx <- sapply(unique(colData(m)[, group]), function(c) {
res <- DelayedMatrixStats::rowSums2(cov_dat[, colData(m)[,group] == c] >= cov_thr, na.rm = TRUE)
row_idx <- (res >= max(min_samples, ceiling(prop_samples * sum(colData(m)[, group] == c))))
})
row_idx <- DelayedMatrixStats::rowAlls(row_idx)
}))
} else {
row_idx <- unlist(mclapply(mc.cores = n_cores, 1:n_chunks,
function(i) {
cov_dat = get_matrix(m[((i - 1) * ceiling(nrow(m)/n_chunks) + 1):min(i * ceiling(nrow(m)/n_chunks), nrow(m)), ],
type = "C")
res <- DelayedMatrixStats::rowSums2(cov_dat >= cov_thr, na.rm = TRUE)
row_idx <- (res >= max(min_samples, ceiling(prop_samples * ncol(cov_dat))))
}))
}
}
} else {
cov_dat = get_matrix(m = m, type = "C")
if (!is.null(group)) {
row_idx <- sapply(unique(colData(m)[, group]), function(c) {
res <- matrixStats::rowSums2(cov_dat[, colData(m)[, group] ==
c] >= cov_thr, na.rm = TRUE)
row_idx <- (res >= max(min_samples, ceiling(prop_samples * sum(colData(m)[, group] == c))))
})
row_idx <- matrixStats::rowAlls(row_idx)
} else {
res <- matrixStats::rowSums2(cov_dat >= cov_thr, na.rm = T)
row_idx <- (res >= max(min_samples, ceiling(prop_samples * ncol(cov_dat))))
}
}
gc()
message(paste0("-Retained ", format(sum(row_idx), big.mark = ","),
" of ", format(nrow(m), big.mark = ","), " sites"))
message("-Finished in: ", data.table::timetaken(start_proc_time))
return(m[row_idx, ])
}
#--------------------------------------------------------------------------------------------------------------------------
#' Extract methylation or coverage matrices
#' @details Takes \code{\link{methrix}} object and returns user specified \code{methylation} or \code{coverage} matrix
#' @param m \code{\link{methrix}} object
#' @param type can be \code{M} or \code{C}. Default 'M'
#' @param add_loci Default FALSE. If TRUE adds CpG position info to the matrix and returns as a data.table
#' @param in_granges Do you want the outcome in \code{GRanges}?
#' @return Coverage or Methylation matrix
#' @examples
#' data('methrix_data')
#' #Get methylation matrix
#' get_matrix(m = methrix_data, type = 'M')
#' #Get methylation matrix along with loci
#' get_matrix(m = methrix_data, type = 'M', add_loci = TRUE)
#' #' #Get methylation data as a GRanges object
#' get_matrix(m = methrix_data, type = 'M', add_loci = TRUE, in_granges=TRUE)
#' @export
get_matrix <- function(m, type = "M", add_loci = FALSE, in_granges=FALSE) {
if (!is(m, "methrix")){
stop("A valid methrix object needs to be supplied.")
}
type <- match.arg(arg = type, choices = c("M", "C"))
if (add_loci == FALSE & in_granges == TRUE) {
warning("Without genomic locations (add_loci= FALSE), it is not possible to convert the results to GRanges, ",
"the output will be a data.table object. ")
}
if (type == "M") {
d <- SummarizedExperiment::assay(x = m, i = 1)
} else {
d <- SummarizedExperiment::assay(x = m, i = 2)
}
if (add_loci) {
if (is_h5(m)) {
d <- as.data.frame(cbind(SummarizedExperiment::rowData(x = m),
as.data.frame(d)))
} else {
d <- as.data.frame(cbind(SummarizedExperiment::rowData(x = m),
d))
}
if (in_granges){
d$end <- d$start +1
d <- GenomicRanges::makeGRangesFromDataFrame(d, keep.extra.columns = TRUE)
} else {
data.table::setDT(x = d)
}
}
d
}
#--------------------------------------------------------------------------------------------------------------------------
#' Convert \code{\link{methrix}} to \code{bsseq} object
#' @details Takes \code{\link{methrix}} object and returns a \code{bsseq} object
#' @param m \code{\link{methrix}} object
#' @return An object of class \code{bsseq}
#' @examples
#' \dontrun{
#' data('methrix_data')
#' methrix2bsseq(m = methrix_data)
#' }
#' @export
#'
methrix2bsseq <- function(m) {
if (!is(m, "methrix")){
stop("A valid methrix object needs to be supplied.")
}
n_samps <- nrow(SummarizedExperiment::colData(x = m))
M_clean <- get_matrix(m) * get_matrix(m, type = "C")
M_clean[is.na(M_clean)] <- 0
assays(m)[[2]][is.na(assays(m)[[2]])] <- 0
b <- bsseq::BSseq(M = M_clean, Cov = get_matrix(m, type = "C"), pData = colData(x = m),
pos = rowData(x = m)[, "start"], chr = rowData(x = m)[, "chr"],
sampleNames = rownames(colData(m)))
b
}
#--------------------------------------------------------------------------------------------------------------------------
#' Remove loci that are uncovered across all samples
#' @details Takes \code{\link{methrix}} object and removes loci that are uncovered across all samples
#' @param m \code{\link{methrix}} object
#' @return An object of class \code{\link{methrix}}
#' @examples
#' data('methrix_data')
#' remove_uncovered(m = methrix_data)
#' @export
#'
remove_uncovered <- function(m) {
start_proc_time <- proc.time()
if (!is(m, "methrix")){
stop("A valid methrix object needs to be supplied.")
}
row_idx<-rowSums(!is.na(assays(m)[[2]]))==0
message(paste0("-Removed ", format(sum(row_idx), big.mark = ","),
" [", round(sum(row_idx)/nrow(m) * 100, digits = 2), "%] uncovered loci of ",
format(nrow(m), big.mark = ","), " sites"))
gc()
message("-Finished in: ", data.table::timetaken(start_proc_time))
if (sum(row_idx) == 0){
m
} else {
m[!row_idx, ]
}
}
#--------------------------------------------------------------------------------------------------------------------------
#' Filter matrices by region
#' @details Takes \code{\link{methrix}} object and filters CpGs based on supplied regions in data.table or GRanges format
#' @param m \code{\link{methrix}} object
#' @param regions genomic regions to filter-out. Could be a data.table with 3 columns (chr, start, end) or a \code{GenomicRanges} object
#' @param type defines the type of the overlap of the CpG sites with the target regions. Default value is `within`. For detailed description,
#' see the \code{foverlaps} function of the \code{\link{data.table}} package.
#' @return An object of class \code{\link{methrix}}
#' @examples
#' data('methrix_data')
#' region_filter(m = methrix_data,
#' regions = data.table(chr = 'chr21', start = 27867971, end = 27868103))
#' @export
region_filter <- function(m, regions, type = "within") {
start_proc_time <- proc.time()
if (!is(m, "methrix")){
stop("A valid methrix object needs to be supplied.")
}
target_regions <- cast_ranges(regions)
current_regions <- data.table::as.data.table(rowData(m))
current_regions[, `:=`(end, start + 1)]
data.table::setDT(x = current_regions, key = c("chr", "start", "end"))
overlap <- data.table::foverlaps(x = current_regions, y = target_regions,
type = type, nomatch = NULL, which = TRUE)
if (nrow(overlap) == 0) {
stop("No CpGs found within the query intervals. Nothing to remove.")
}
message(paste0("-Removed ", format(nrow(overlap), big.mark = ","),
" CpGs"))
message("-Finished in: ", data.table::timetaken(start_proc_time))
m[-overlap$xid, ]
}
#--------------------------------------------------------------------------------------------------------------------------
#' Masks too high or too low coverage
#' @details Takes \code{\link{methrix}} object and masks sites with too high or too low coverage
#' by putting NA for coverage and beta value. The sites will remain in the object.
#' @param m \code{\link{methrix}} object
#' @param low_count The minimal coverage allowed. Everything below, will get masked. Default = NULL, nothing gets masked.
#' @param high_quantile The quantile limit of coverage. Quantiles are calculated for each sample and everything that belongs to a
#' higher quantile than the defined will be masked. Default = 0.99.
#' @param n_cores Number of parallel instances. Can only be used if \code{\link{methrix}} is in HDF5 format. Default = 1.
#' @return An object of class \code{\link{methrix}}
#' @examples
#' data('methrix_data')
#' mask_methrix(m = methrix_data, low_count = 5, high_quantile = 0.99 )
#' @export
mask_methrix <- function(m, low_count = NULL, high_quantile = 0.99, n_cores=1) {
start_proc_time <- proc.time()
if (!is(m, "methrix")){
stop("A valid methrix object needs to be supplied.")
}
if (!is_h5(m) & n_cores != 1) {
stop("Parallel processing not supported for a non-HDF5 methrix object due to probable high memory usage. \nNumber of cores (n_cores) needs to be 1.")
}
if (!is.null(low_count)) {
if(!is.numeric(low_count)){
stop("low_count must be a numeric value.")
}
message("-Masking coverage lower than ", low_count)
row_idx1 <- (assays(m)[[2]] < low_count)
assays(m)[[1]][row_idx1] <- as.double(NA)
assays(m)[[2]][row_idx1] <- as.integer(NA)
if (is_h5(m)) {
if (n_cores == 1){
n <- DelayedMatrixStats::colSums2(row_idx1, na.rm = T)}
else {
n <- simplify2array(mclapply(mc.cores = n_cores, 1:ncol(row_idx1), function(i) sum(row_idx1[, i], na.rm = T)))}
} else {
n <- colSums(row_idx1, na.rm = T)
}
for (i in seq_along(colnames(m))) {
message(paste0("-Masked ", n[i], " CpGs due to too low coverage in sample ", colnames(row_idx1)[i], "."))
}
}
if (!is.null(high_quantile)) {
if (high_quantile >= 1 | high_quantile <= 0) {
stop("High quantile should be between 0 and 1. ")
}
message("\n\n-Masking coverage higher than ", high_quantile*100," percentile")
if (is_h5(m)) {
if (n_cores == 1){
quantiles <- DelayedMatrixStats::colQuantiles(assays(m)[[2]], probs = high_quantile, na.rm = TRUE, drop = F)}
else {
quantiles <- simplify2array(mclapply(mc.cores = n_cores, 1:ncol(assays(m)[[2]]),
function(i) quantile(assays(m)[[2]][, i], probs = high_quantile, na.rm = TRUE)))}
quantiles <- as.vector(quantiles)
names(quantiles) <- rownames(colData(m))
} else {
quantiles <- matrixStats::colQuantiles(assays(m)[[2]], probs = high_quantile, na.rm = TRUE)
quantiles <- as.vector(quantiles)
names(quantiles) <- rownames(colData(m))
}
row_idx2 <- t(t((assays(m)[[2]])) > quantiles)
assays(m)[[1]][row_idx2] <- as.double(NA)
assays(m)[[2]][row_idx2] <- as.integer(NA)
if (is_h5(m)) {
if (n_cores == 1){
n <- DelayedMatrixStats::colSums2(row_idx2, na.rm = T)}
else {
n <- simplify2array(mclapply(mc.cores = n_cores, 1:ncol(row_idx2),
function(i) sum(row_idx2[, i], na.rm = T)))}
} else {
n <- colSums(row_idx2, na.rm = T)
}
for (i in seq_along(colnames(m))) {
message(paste0("-Masked ", n[i], " CpGs due to too high coverage in sample ",
colnames(row_idx2)[i], "."))
}
}
message("-Finished in: ", data.table::timetaken(start_proc_time))
return(m)
}
#--------------------------------------------------------------------------------------------------------------------------
#' Combine methrix objects
#' @details Takes two \code{\link{methrix}} objects and combines them row- or column-wise
#' @param m1 Frist \code{\link{methrix}} object
#' @param m2 Second \code{\link{methrix}} object
#' @param by The direction of combine. 'column' (cbind) combines samples with same regions, 'row' combines different regions,
#' e.g. different chromosomes.
#' @return An object of class \code{\link{methrix}}
#' @export
#'
combine_methrix <- function(m1, m2, by = c("row", "col")) {
if (!is(m1, "methrix") || !is(m2, "methrix") ){
stop("Valid methrix objects need to be supplied.")
}
by <- match.arg(arg = by, choices = c("row", "col"), several.ok = FALSE)
if (by == "row") {
if (nrow(colData(m1))!= nrow(colData(m2)) || !(all(rownames(colData(m1)) == rownames(colData(m2))))) {
stop("You have different samples in your dataset. You need the same samples in your datasets. ")
} else {
m <- rbind(m1, m2)
}
if (any(duplicated((as.data.table(rowData(m)))))) {
stop("There are overlapping regions in your datasets. This function only takes distinct objects. ")
}
}
if (by == "col") {
if (any(rownames(colData(m1)) %in% rownames(colData(m2)))) {
stop("You have the same samples in your datasets. You need different samples for this merging. ")
} else if (!identical(rowData(m1), rowData(m2))) {
stop("You have to have the same regions in your datasets. ")
} else {
m <- cbind(m1, m2)
}
}
gc()
return(m)
}
#--------------------------------------------------------------------------------------------------------------------------
#' Estimate descriptive statistics
#' @details Calculate descriptive statistics
#' @param m \code{\link{methrix}} object
#' @param per_chr Estimate stats per chromosome. Default TRUE
#' @seealso \code{\link{plot_stats}}
#' @examples
#' data('methrix_data')
#' get_stats(methrix_data)
#' @return data.table of summary stats
#' @export
get_stats <- function(m, per_chr = TRUE) {
median <- . <- sd <- chr <- NULL
start_proc_time <- proc.time()
if (!is(m, "methrix")){
stop("A valid methrix object needs to be supplied.")
}
#Counting NA's seems to be not-ideal (especially in cases of masked/coverage filtered cases.)
# row_idx <- data.table::as.data.table(which(is.na(get_matrix(m = m,
# type = "C")), arr.ind = TRUE))
# colnames(row_idx) <- c("row", "col")
# row_idx <- split(row_idx, as.factor(as.character(row_idx$col)))
if (per_chr) {
cov_stat <- lapply(1:ncol(m), function(i){
get_matrix(m = m[,i], type = "C", add_loci = TRUE)[, c(1, 4), with = FALSE][, .(
mean_cov = lapply(.SD,
matrixStats::mean2, na.rm = TRUE),
median_cov = lapply(.SD,
median, na.rm = TRUE),
sd_cov = lapply(.SD, sd, na.rm = TRUE)
),
by = chr]
})
meth_stat <- lapply(1:ncol(m), function(i){
get_matrix(m = m[,i], type = "M", add_loci = TRUE)[, c(1, 4), with = FALSE][, .(
mean_meth = lapply(.SD,
matrixStats::mean2, na.rm = TRUE),
median_meth = lapply(.SD,
median, na.rm = TRUE),
sd_meth = lapply(.SD, sd, na.rm = TRUE)
),
by = chr]
})
names(meth_stat) <- colnames(m)
names(cov_stat) <- colnames(m)
cov_stat <- data.table::rbindlist(l = cov_stat, use.names = TRUE,
idcol = "Sample_Name")
meth_stat <- data.table::rbindlist(l = meth_stat, use.names = TRUE,
idcol = "Sample_Name")
stats <- merge(meth_stat, cov_stat, by = c("chr", "Sample_Name"))
colnames(stats)[1] <- "Chromosome"
stats$Chromosome <- factor(x = stats$Chromosome, levels = S4Vectors::metadata(m)$chrom_sizes$contig)
} else {
if (is_h5(m)) {
cov_stat <- data.table::data.table(Sample_Name = colnames(m),
mean_cov = DelayedMatrixStats::colMeans2(get_matrix(m = m, "C"), na.rm = TRUE),
median_cov = DelayedMatrixStats::colMedians(get_matrix(m = m, "C"), na.rm = TRUE),
sd_cov = DelayedMatrixStats::colSds(get_matrix(m = m, "C"), na.rm = TRUE)
)
meth_stat <- data.table::data.table(Sample_Name = colnames(m),
mean_meth = DelayedMatrixStats::colMeans2(get_matrix(m = m, "M"), na.rm = TRUE),
median_meth = DelayedMatrixStats::colMedians(get_matrix(m = m, "M"), na.rm = TRUE),
sd_meth = DelayedMatrixStats::colSds(get_matrix(m = m, "M"), na.rm = TRUE)
)
} else {
cov_stat <- data.table::data.table(Sample_Name = colnames(m),
mean_cov = matrixStats::colMeans2(get_matrix(m = m, "C"), na.rm = TRUE),
median_cov = matrixStats::colMedians(get_matrix(m = m, "C"), na.rm = TRUE),
sd_cov = matrixStats::colSds(get_matrix(m = m, "C"), na.rm = TRUE)
)
meth_stat <- data.table::data.table(Sample_Name = colnames(m),
mean_meth = matrixStats::colMeans2(get_matrix(m = m, "M"), na.rm = TRUE),
median_meth = matrixStats::colMedians(get_matrix(m = m, "M"), na.rm = TRUE),
sd_meth = matrixStats::colSds(get_matrix(m = m, "M"), na.rm = TRUE)
)
}
stats <- merge(meth_stat, cov_stat, by = c("Sample_Name"))
}
gc()
message("-Finished in: ", data.table::timetaken(start_proc_time))
stats
}
#--------------------------------------------------------------------------------------------------------------------------
#' Saves HDF5 methrix object
#' @details Takes \code{\link{methrix}} object and saves it
#' @param m \code{\link{methrix}} object
#' @param dir The directory to use. Created, if not existing. Default NULL
#' @param replace Should it overwrite the pre-existing data? FALSE by default.
#' @param ... Parameters to pass to saveHDF5SummarizedExperiment
#' @examples
#' data('methrix_data')
#' methrix_data_h5 <- convert_methrix(m=methrix_data)
#' target_dir = paste0(getwd(), '/temp/')
#' save_HDF5_methrix(methrix_data_h5, dir = target_dir, replace = TRUE)
#' @return Nothing
#' @export
save_HDF5_methrix <- function(m = NULL, dir = NULL, replace = FALSE, ...) {
if (is.null(dir)) {
stop("Please provide an target directory to save the results")
}
if (is(m, "methrix") && is_h5(m)) {
HDF5Array::saveHDF5SummarizedExperiment(x = m, dir = dir, replace = replace,
...)
} else {
stop("The object is not a methrix object or not in an HDF5 format. ")
}
}
#--------------------------------------------------------------------------------------------------------------------------
#' Loads HDF5 methrix object
#' @details Takes directory with a previously saved HDF5Array format \code{\link{methrix}} object and loads it
#' @param dir The directory to read in from. Default NULL
#' @param ... Parameters to pass to loadHDF5SummarizedExperiment
#' @return An object of class \code{\link{methrix}}
#' @examples
#' data('methrix_data')
#' methrix_data_h5 <- convert_methrix(m=methrix_data)
#' target_dir = paste0(getwd(), '/temp1/')
#' save_HDF5_methrix(methrix_data_h5, dir = target_dir, replace = TRUE)
#' load_HDF5_methrix(target_dir)
#' @export
load_HDF5_methrix <- function(dir = NULL, ...) {
if (is.null(dir)) {
stop("Please provide the target directory containing ")
}
m <- HDF5Array::loadHDF5SummarizedExperiment(dir = dir, ...)
m <- as(m, "methrix")
return(m)
}
#--------------------------------------------------------------------------------------------------------------------------
#' Converts HDF5 methrix object to standard in-memory object.
#' @details Takes a \code{\link{methrix}} object and returns with the same object with in-memory assay slots.
#' @param m An object of class \code{\link{methrix}}, HDF5 format
#' @return An object of class \code{\link{methrix}}
#' @examples
#' data(methrix_data)
#' m2 <- convert_methrix(m=methrix_data)
#' m <- convert_HDF5_methrix(m=m2)
#' @export
convert_HDF5_methrix <- function(m = NULL) {
if (is.null(m) | !is(m, "methrix")) {
stop("No or not valid input data provided.")
}
if (!is_h5(m)) {
stop("The input data is not in HDF5 format. No conversion happened.")
}
assays(m)[[1]] <- as.matrix(assays(m)[[1]])
assays(m)[[2]] <- as.matrix(assays(m)[[2]])
S4Vectors::metadata(m)$is_h5 <- FALSE
return(m)
}
#--------------------------------------------------------------------------------------------------------------------------
#' Converts an in-memory object to an on-disk HDF5 object.
#' @details Takes a \code{\link{methrix}} object and returns with the same object with delayed array assay slots
#' with HDF5 backend. Might take long time!
#' @param m An object of class \code{\link{methrix}}
#' @return An object of class \code{\link{methrix}}, HDF5 format
#' @examples
#' data(methrix_data)
#' m2 <- convert_methrix(m=methrix_data)
#' @export
convert_methrix <- function(m = NULL) {
if (is.null(m) | !is(m, "methrix")) {
stop("No or not valid input data provided.")
}
if (is_h5(m)) {
stop("The input data is already in HDF5 format. No conversion happened.")
}
m <- create_methrix(beta_mat = assays(m)[[1]], cov_mat = assays(m)[[2]],
cpg_loci = rowData(m), is_hdf5 = TRUE, genome_name = S4Vectors::metadata(m)$genome,
col_data = colData(m), chrom_sizes = S4Vectors::metadata(m)$chrom_sizes, ref_cpg_dt = S4Vectors::metadata(m)$ref_CpG,
desc = S4Vectors::metadata(m)$descriptive_stats)
return(m)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.