Nothing
#########################################################
# Utility functions
# Conversion functions in 2nd half of file
########################################################
# This function is a drop-in replacement for the base list() function,
# which automatically names your list according to the names of the
# variables used to construct it.
# It seamlessly handles lists with some names and others absent,
# not overwriting specified names while naming any unnamed parameters.
#
# @param ... arguments passed to list()
# @return A named list object.
# @examples
# x <- 5
# y <- 10
# nlist(x, y) # returns list(x = 5, y = 10)
# list(x, y) # returns unnamed list(5, 10)
nlist <- function(...) {
fcall <- match.call(expand.dots = FALSE)
l <- list(...);
if (!is.null(names(list(...)))) {
names(l)[names(l) == ""] <- fcall[[2]][names(l) == ""]
} else {
names(l) <- fcall[[2]];
}
return(l)
}
# Function to run lapply or mclapply, depending on the option set in
# getOption("mc.cores"), which can be set with setLapplyAlias().
#
# @param ... Arguments passed lapply() or mclapply()
# @param mc.preschedule Argument passed to mclapply
# @return Result from lapply or parallel::mclapply
lapplyAlias <- function(..., mc.preschedule = TRUE) {
if (is.null(getOption("mc.cores"))) { setLapplyAlias(1) }
if (getOption("mc.cores") > 1) {
return(parallel::mclapply(..., mc.preschedule = mc.preschedule))
} else {
return(lapply(...))
}
}
# To make parallel processing a possibility but not required,
# I use an lapply alias which can point at either the base lapply
# (for no multicore), or it can point to mclapply,
# and set the options for the number of cores (what mclapply uses).
# With no argument given, returns intead the number of cpus currently selected.
#
# @param cores Number of cpus
# @return None
setLapplyAlias <- function(cores = 0) {
if (cores < 1) {
return(getOption("mc.cores"))
}
if (cores > 1) { # use multicore?
if (requireNamespace("parallel", quietly = TRUE)) {
options(mc.cores = cores)
} else {
warning(cleanws("You don't have package parallel installed.
Setting cores to 1."))
options(mc.cores = 1) # reset cores option.
}
} else {
options(mc.cores = 1) # reset cores option.
}
}
# helper function
# given a vector of columns, and the equally-sized vector of functions
# to apply to those columns, constructs a j-expression for use in
# a data.table
# (functions applied to columns in corresponding spot in "cols" string).
# One function may be given to be applied to multiple columns.
# use it in a DT[, eval(parse(text = buildJ(cols, funcs)))]
# @param cols A string/vector of strings containing columns
# on which to use functions.
# @param funcs Functions to use on columns.
# @return A jcommand string. After performing function on column, column
# is reassigned the same name.
buildJ <- function(cols, funcs, newColNames=NULL) {
if (is.null(newColNames)) {
# previously the only option (changed 10/16/17)
r <- paste("list(", paste(paste0(cols, "=", funcs, "(", cols, ")"), collapse = ","), ")")
} else {
r <- paste("list(", paste(paste0(newColNames, "=", funcs, "(", cols, ")"), collapse = ","), ")")
}
return(r);
}
# cleanws takes multi-line, code formatted strings and just formats them
# as simple strings
# @param string string to clean
# @return A string with all consecutive whitespace characters, including
# tabs and newlines, merged into a single space.
cleanws <- function(string) {
return(gsub('\\s+'," ", string))
}
#' Combine data.tables from a named list into one big data.table with extra
#' column for name.
#'
#' Combines data.tables from a list as data.table::rbindlist except an
#' extra column is added which can mark which rows came from which
#' original data.table. The name of this new column can be
#' specified (newColName param) and the values are the names of the input list.
#' This function was originally intended for a list where each
#' data.table corresponds to a separate sample and the list names
#' correspond to the names of the samples. Now all samples can
#' be combined into one data.table but which sample the information
#' came from can be kept.
#'
#' @param namedList A list of data.tables. One sample per data.table/list item.
#' Names of the list are the sample names.
#' @param newColName The name of the column that will be added to the data.tables
#'
#' @return mergedDT A single data.table that combines those in the input list
#' (data.table::rbindlist)
#' but also added a column (default, newColName="sampleName") to each one to
#' keep track of which rows were from which sample
#' @examples
#' # NOTE: generally sample names should be added after running aggregateMethyl()
#' # rather than before in order to save memory (since data.table has less rows
#' # after aggregation)
#' data(exampleBSseqObj)
#' multiSampleList <- bsseqToDataTable(exampleBSseqObj)
#' combinedDT <- rbindNamedList(multiSampleList)
#' @export
rbindNamedList <- function(namedList, newColName="sampleName") {
# making a copy so original object will not be changed by reference
namedListC <- data.table::copy(namedList)
newColVals <- names(namedListC)
# if a not all items in list are named
if (length(newColVals) < length(namedListC)) {
warning("All list items should be named.")
}
# named it .y just in case someone made their new column name (newColName) y
# after testing I don't think that actually is a problem but I'll keep it
jExpr <- paste0(newColName, " := .y")
# by reference
mapply(FUN = function(x, .y) x[, eval(parse(text = jExpr))], namedListC, newColVals)
mergedDT <- rbindlist(namedListC, fill = TRUE)
return(mergedDT)
}
#################### Conversion Functions ########################
# Convert between different object types
# Convert a data.table to GRanges object.
#
# @param DT a data.table with at least "chr" and "start" columns
#
# @return gr A genomic ranges object derived from DT
dtToGr <- function(DT, chr = "chr", start = "start",
end = NA, strand = NA, name = NA,
splitFactor = NA, metaCols = NA) {
if (is.na(splitFactor)) {
return(dtToGrInternal(DT, chr, start, end, strand, name, metaCols));
}
if ( length(splitFactor) == 1 ) {
if (splitFactor %in% colnames(DT)) {
splitFactor <- DT[, get(splitFactor)];
}
}
lapply(split(1:nrow(DT), splitFactor),
function(x) {
dtToGrInternal(DT[x, ], chr, start, end, strand, name, metaCols)
}
)
}
dtToGR <- dtToGr;
# Internal part of a utility to convert data.tables into GRanges objects
#
# @param DT A data.table with at least "chr" and "start" columns
# @return gr A genomic ranges object derived from DT
dtToGrInternal <- function(DT, chr, start,
end = NA, strand = NA, name = NA, metaCols = NA) {
if (is.na(end)) {
if ("end" %in% colnames(DT)) {
end <- "end"
} else {
end <- start;
}
}
if (is.na(strand)) {
if ("strand" %in% colnames(DT)) { # checking if strand info is in DT
strand <- "strand"
}
}
if (is.na(strand)) {
gr <- GRanges(seqnames = DT[[`chr`]],
ranges = IRanges(start = DT[[`start`]], end = DT[[`end`]]),
strand = "*")
} else {
# GRanges can only handle '*' for no strand, so replace any non-accepted
# characters with '*'
DT[, strand := as.character(strand)]
DT[strand == "1", strand := "+"]
DT[strand == "-1", strand := "-"]
DT[[`strand`]] <- gsub("[^+-]", "*", DT[[`strand`]])
gr <- GRanges(seqnames = DT[[`chr`]],
ranges = IRanges(start = DT[[`start`]], end = DT[[`end`]]),
strand = DT[[`strand`]])
}
if (! is.na(name)) {
names(gr) <- DT[[`name`]];
} else {
names(gr) <- seq_along(gr);
}
if (! is.na(metaCols)) {
for(x in metaCols) {
elementMetadata(gr)[[`x`]] <- DT[[`x`]]
}
}
gr;
}
# Converts a list of data.tables into GRanges.
# @param dtList A list of data.tables,
# Each should have "chr", "start", "methylCount", and "coverage" columns.
# Error results if missing "chr", "start" but if methylCount and coverage are
# missing, it will still work, just not have that info in the output.
# @return a list of GRanges objects, strand has been set to "*",
# "start" and "end" have both been set to "start" of the DT.
# methylCount and coverage info is not included in GRanges object.
BSdtToGRanges <- function(dtList) {
gList <- list();
for (i in seq_along(dtList)) {
# dt <- dtList[[i]];
setkey(dtList[[i]], chr, start)
# convert the data into granges object
gList[[i]] <- GRanges(seqnames = dtList[[i]]$chr,
ranges = IRanges(start = dtList[[i]]$start,
end = dtList[[i]]$start),
strand = rep("*", nrow(dtList[[i]])))
# I used to use end = start + 1, but this targets CG instead of just
# a C, and it's causing edge-effects problems when I assign Cs to
# tiled windows using (within). Aug 2014 I'm changing to start/end at
# the same coordinate.
}
return(gList);
}
# Convert a GenomicRanges into a data.table
#
# Also can convert GPos objects to a data.table.
#
# @param GR A GRanges object
# @param includeStrand Boolean, whether to include strand from GR in output DT
# @return A data.table object with columns:
# "chr", "start", and "end" (possibly strand)
grToDt <- function(GR, includeStrand = FALSE) {
DF <- as.data.frame(elementMetadata(GR))
if ( ncol(DF) > 0) {
if (includeStrand) {
DT <- data.table(chr = as.vector(seqnames(GR)),
start = start(GR),
end = end(GR),
strand = as.vector(strand(GR), mode = "character"),
DF)
} else{
DT <- data.table(chr = as.vector(seqnames(GR)),
start = start(GR),
end = end(GR),
DF)
}
} else {
if (includeStrand) {
DT <- data.table(chr = as.vector(seqnames(GR)),
start = start(GR),
end = end(GR),
strand = as.vector(strand(GR), mode = "character"))
} else{
DT <- data.table(chr = as.vector(seqnames(GR)),
start = start(GR),
end = end(GR))
}
}
return(DT)
}
#' Make MIRA-compatible data.tables using information
#' from SummarizedExperiment-based classes
#'
#' Packages that use SummarizedExperiment-based classes for DNA methylation
#' data which could be converted with this function
#' include "bsseq", "methylPipe", and "BiSeq" packages.
#' Of the methylCountDF, coverageDF, and methylPropDF arguments, either
#' methylPropDF must be given or both methylCountDF and coverageDF must
#' be given. After that, whichever is not given will be calculated
#' from the others. If coverageDF and methylCountDF are both not given,
#' coverage will be assumed to be 1 at each site with a methylProp value.
#' Acceptable formats for the three "DF" parameters include:
#' data.frame, data.table, matrix, and DelayedMatrix classes.
#' If converting a BSseq object, see ?bsseqToDataTable for a convenient
#' wrapper of this function.
#'
#' @param coordinates Coordinates for the methylation loci as a GRanges
#' (or GPos) object (in same order as methylCountDF/coverageDF/methylPropDF,
#' whichever is given).
#' The start coordinate should be the coordinate of the cytosine.
#' @param methylCountDF An object of matrix/data.frame/similar format
#' that contains the number of reads with methylated C's for the loci in
#' 'coordinates' argument. It should have one column per sample with
#' rows that correspond to 'coordinates'.
#' @param coverageDF An object of matrix/data.frame/similar format
#' that contains the total number of reads for the loci in
#' 'coordinates' argument. It should have one column per sample with
#' rows that correspond to 'coordinates'.
#' @param methylPropDF An object of matrix/data.frame/similar format
#' that contains the total number of reads for the loci in
#' 'coordinates' argument. It should have one column per sample with
#' rows that correspond to 'coordinates'.
#' @param sample_names A character vector with sample names in the same
#' order as the columns of methylCountDF/coverageDF/methylPropDF
#' samples in the columns
#' @return MIRAFormatBSDTList A list of data.tables containing
#' the methylation data. One data.table per sample with the column
#' names: 'chr', 'start' (methylation coordinate), 'methylCount' (number of
#' methylated reads), 'coverage' (total number of reads), and
#' 'methylProp' (proportion of methylated reads). The order of the
#' list is the order of samples in the columns of
#' methylCountDF/coverageDF/methylPropDF. If sample names
#' are explicitly given as input ('sample_names' argument)
#' or can be derived from other arguments to the function then
#' a named list will be returned.
#' @examples
#' data("exampleBSseqObj")
#' MIRAFormatBSDTList <- SummarizedExperimentToDataTable(coordinates =
#' bsseq::granges(exampleBSseqObj),
#' methylCountDF = bsseq::getCoverage(BSseq = exampleBSseqObj, type = "M"),
#' coverageDF = bsseq::getCoverage(BSseq = exampleBSseqObj, type = "Cov"),
#' methylPropDF = bsseq::getMeth(BSseq = exampleBSseqObj, type = "raw"),
#' sample_names = bsseq::sampleNames(exampleBSseqObj))
#' @export
SummarizedExperimentToDataTable <- function(coordinates, methylCountDF=NULL,
coverageDF=NULL, methylPropDF=NULL,
sample_names=NULL) {
haveMCount <- !is.null(methylCountDF)
haveCov <- !is.null(coverageDF)
haveMProp <- !is.null(methylPropDF)
# making sure sufficient data has been given
if (!haveMProp) {
# if no methylProp given, you need both methylCount and coverage
if (!haveMCount || !haveCov) {
stop(cleanws("If methylProp is not given, both
methylCount and coverage must be given."))
}
}
# checking that right data types have been given
if (!(is(coordinates, "GRanges") || is(coordinates, "GPos"))) {
stop("'coordinates' argument should be a GRanges or GPos object")
}
# check for accepted formats?: delayed matrix?, matrix, data.frame, data.table
# not sure if there might be other valid inputs so excluding this for now
# if using this code, change to is(object, class) format for if statements
# if (haveMCount) {
# if (!("" %in% class(methylCountDF))) {
# stop()
# }
# }
# if (haveCov) {
# if (!("" %in% class(coverageDF))) {
# stop()
# }
# }
# if (haveMProp) {
# if (!("" %in% class(methylPropDF))) {
# stop()
# }
# }
MIRAFormatBSDTList <- list() # to store output
# changing coordinates from GRanges object to data.table
coordinates <- grToDt(coordinates)
rowNum <- nrow(coordinates)
# setting sampleNum and doing checks on the inputs
if (haveMProp) {
sampleNum <- ncol(methylPropDF)
# making sure other inputs also have sampleNum columns
if (haveMCount) {
if (sampleNum != ncol(methylCountDF)) {
stop(cleanws("Input with the count of
methylated reads at each loci and input
with the proportion of methylation should have
the same number of columns."))
}
}
if (haveCov) {
if (sampleNum != ncol(coverageDF)) {
stop(cleanws("Input with the count of total reads at each loci
and input with the proportion of methylation
should have the same number of columns."))
}
}
} else {
sampleNum <- ncol(coverageDF)
# making sure other input has the same number of columns
if (sampleNum != ncol(methylCountDF)) {
stop(cleanws("Input with the count of total reads
and input with the count of methylated reads
at each loci
should have the same number of columns."))
}
}
# making sure number of rows are the same in all given inputs
if (haveMProp) {
if (rowNum != nrow(methylPropDF)) {
stop(cleanws("The number of rows for the genomic coordinates and
the number of rows for the proportion of methylation
for each loci should be the same."))
}
}
if (haveMCount) {
if (rowNum != nrow(methylCountDF)) {
stop(cleanws("The number of rows for the genomic coordinates and
the number of rows for the count of methylated reads
for each loci should be the same."))
}
}
if (haveCov) {
if (rowNum != nrow(coverageDF)) {
stop(cleanws("The number of rows for the genomic coordinates and
the number of rows for the count of total reads
for each loci should be the same."))
}
}
# adding sample names if not given as argument but present in other input
if (is.null(sample_names)) {
if (haveMProp) {
sample_names <- colnames(methylPropDF)
} else {
sample_names <- colnames(coverageDF)
}
}
if (!haveCov && !haveMCount) {
coverage <- rep(1, rowNum)
warning(cleanws("coverage was not included in input
so it was set to 1.
It is recommended to set the minBaseCovPerBin
argument of aggregateMethyl to 0."))
}
# looping through columns/samples to make 1 data.table per sample
for (i in 1:sampleNum) {
# each column is a different sample
# order of the below if statements is important
if (haveMCount) {
methylCount <- methylCountDF[, i]
}
if (haveMProp) {
methylProp <- methylPropDF[, i]
}
if (haveCov) {
coverage <- coverageDF[, i]
} else if (haveMCount) {
# assumed that if !haveCov, you haveMProp
coverage <- round(methylCount / methylProp, 3)
}
if (haveCov && !haveMCount) {
# theoretically this should be a whole number
methylCount <- round(methylProp * coverage, 0)
}
if (!haveCov && !haveMCount) {
# this will result in fractional methylCount values (from 0 to 1)
# but is necessary to be consistent with giving each loci a
# coverage of 1 which was done earlier in code
# for !haveCov && !haveMCount
# methylCount = methylProp * coverage = methylProp * 1
methylCount <- methylProp
}
if (!haveMProp) {
methylProp <- round(methylCount / coverage, 3)
}
# index for taking out rows with 0 coverage
notCovered <- which(coverage == 0)
# checking for NAs in methylProp
# they only should be present if methylPropDF was given
if (haveMProp) {
notCoveredNA <- which(is.na(methylProp))
notCovered <- sort(c(notCovered, notCoveredNA))
}
MIRAFormatBSDTList[[i]] <- data.table(chr = coordinates[, chr],
start = coordinates[, start],
methylCount = as.vector(methylCount),
coverage = as.vector(coverage),
methylProp = as.vector(methylProp)
)[!notCovered,] # filtering
setnames(MIRAFormatBSDTList[[i]],
c("chr", "start", "methylCount", "coverage", "methylProp"))
}
# add sample names to list (by reference)
if (!is.null(sample_names)) {
setattr(MIRAFormatBSDTList, "names", sample_names)
}
return(MIRAFormatBSDTList)
}
#' Convert a BSseq object to data.table format for MIRA.
#'
#' Converts a BSseq object to a list of data.tables with one data.table
#' per sample. A wrapper of the SummarizedExperimentToDataTable function.
#'
#' @param BSseqObj An object of class BSseq, can have smoothed or raw
#' methylation data.
#' @return MIRAFormatBSDTList A list of data.tables containing
#' the methylation data. One data.table per sample with the column
#' names: 'chr', 'start' (methylation coordinate), 'methylCount' (number of
#' methylated reads), 'coverage' (total number of reads), and
#' 'methylProp' (proportion of methylated reads). The order of the
#' list is the order of samples in the columns of the BSseq object.
#' If sample names are in the BSseq object, then a named list
#' will be returned.
#' @examples
#' data("exampleBSseqObj")
#' MIRAFormatBSDTList <- bsseqToDataTable(exampleBSseqObj)
#' @export
bsseqToDataTable <- function(BSseqObj) {
if (bsseq::hasBeenSmoothed(BSseqObj)) {
rawSmooth <- "smooth"
} else {
rawSmooth <- "raw"
}
# use accessor functions from bsseq to get needed data
MIRAFormatBSDTList <- SummarizedExperimentToDataTable(coordinates = bsseq::granges(BSseqObj),
methylCountDF = getCoverage(BSseq = BSseqObj, type = "M"),
coverageDF = getCoverage(BSseq = BSseqObj, type = "Cov"),
methylPropDF = getMeth(BSseq = BSseqObj, type = rawSmooth),
sample_names = bsseq::sampleNames(BSseqObj))
# # if only one sample, do not return list, just return data.table
# if (length(MIRAFormatBSDTList) == 1) {
# MIRAFormatBSDTList <- MIRAFormatBSDTList[[1]]
# }
return(MIRAFormatBSDTList)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.