#' Tidy manipulation of motifs.
#'
#' @return
#' For [to_df()]: a `data.frame` with the exposed slots as columns.
#'
#' For [update_motifs()]: the updated `data.frame`.
#'
#' For [requires_update()]: `TRUE` if the motifs are out of date,
#' `FALSE` if otherwise. Note that this function uses `identical()`
#' to check for this, which can be quite slow for large datasets. It
#' is usually just as fast to simply run [update_motifs()] in such
#' cases.
#'
#' For [to_list()]: a list of motifs.
#'
#' @details
#' To turn off the informative messages/warnings when printing the object to
#' the console, set `options(universalmotif_df.warning=FALSE)`.
#'
#' @examples
#' \dontrun{
#' library(universalmotif)
#' library(dplyr)
#'
#' m <- c(create_motif(name = "motif A"), create_motif(name = "motif B"))
#'
#' # Change the names of the motifs using the tidy way:
#' m <- m %>%
#' to_df() %>%
#' mutate(name = paste0(name, "-2")) %>%
#' to_list()
#'
#' # Add your own metadata to be stored in the extrainfo slot:
#' m_df <- to_df(m)
#' m_df$MyMetadata <- c("Info_1", "Info_2")
#' m <- to_list(m_df, extrainfo = TRUE)
#' }
#'
#' @author Benjamin Jean-Marie Tremblay, \email{benjamin.tremblay@@uwaterloo.ca}
#' @name tidy-motifs
NULL
#' @export
as.character.universalmotif <- function(x, maxchar = 12, ...) {
name <- x["name"]
if (nchar(name) > (maxchar - 6)) name <- paste0(substr(name, 1, maxchar - 8), "..")
paste0("<mot:", name, ">")
}
#' @export
print.universalmotif_df <- function(x, na.rm = TRUE, ...) {
y <- x
x_og <- x
if (na.rm) {
empty_cols <- vapply(x, function(x) all(is.na(x)), logical(1))
x <- x[, !empty_cols, drop = FALSE]
}
# For skinny terminals, this value will be larger than necessary, but I'm
# not sure how to check for this without easily breakable strsplit() stuff.
# The point of this is to avoid diff-ing dataframes with thousands of rows
# if only a fraction of those will be printed. Could probably use some
# cleaning up.
toprint <- length(capture.output(print(as.data.frame(x))))
if (toprint < nrow(x)) {
y <- x[seq_len(toprint), ]
}
checktry <- try(checkdf <- suppressWarnings(update_motifs(y)), silent = TRUE)
founderr <- FALSE
founddiff <- FALSE
if (inherits(checktry, "try-error")) {
founderr <- TRUE
} else {
diffd <- mapply(identical, as.list(y$motif), as.list(checkdf$motif))
if (any(!diffd)) {
x <- as.data.frame(cbind(" " = character(nrow(x)), x))
x[[1]][seq_along(diffd)] <- c("*", "")[diffd + 1]
founddiff <- TRUE
}
}
print.data.frame(x)
printNL <- FALSE
if (na.rm && any(empty_cols) && !isFALSE(getOption("universalmotif_df.warning"))) {
empty_cols <- names(empty_cols)[empty_cols]
empty_cols <- paste0(empty_cols, collapse = ", ")
cat("\n", wmsg("[Hidden empty columns: ", empty_cols, ".]"), sep = "")
printNL <- TRUE
}
if (founderr && !isFALSE(getOption("universalmotif_df.warning"))) {
cat("\n", wmsg("[Note: incomplete universalmotif_df object.]"), sep = "")
printNL <- TRUE
}
if (founddiff && !isFALSE(getOption("universalmotif_df.warning"))) {
cat("\n", wmsg("[Rows marked with * are changed. Run update_motifs()",
" or to_list() to apply changes.]"), sep = "")
printNL <- TRUE
}
if (printNL) cat("\n")
invisible(x_og)
}
#' @export
#' @param motifs List of motifs.
#' @rdname tidy-motifs
to_df <- function(motifs, extrainfo = TRUE) {
x <- convert_motifs(motifs)
if (!is.list(x)) x <- list(x)
y <- summarise_motifs_with_extras(x)
if (extrainfo) {
einfo <- extrainfo_to_df(x)
if (!is.null(einfo) && nrow(einfo))
y <- cbind(y, einfo)
}
# interesting side effect of using I(x): trying to use the motif column directly
# in functions such as view_motifs(df$motif) will throw an error.
# Maybe provide a method for "AsIs" in convert_motifs() which will throw a more
# informative error message? (Add extra class alongside "AsIs"?)
y <- cbind(motif = I(x), y)
# some functions may drop the extra class, but it should only affect pretty printing
structure(y, class = c("universalmotif_df", "data.frame"))
}
#' @export
#' @param motif_df Motif `data.frame` generated by [to_df()].
#' @param extrainfo Use the `extrainfo` slot in the tidy `data.frame`. The
#' column names will be taken from the character vectors themselves, and
#' unnamed elements will be assigned a unique name. To add elements to the
#' slot, simply create new columns in the `data.frame`. Note that these will
#' be coerced into characters. If `extrainfo` is not set to `TRUE` in
#' `to_df()`, then the contents of the slot will not be transferred to the
#' `data.frame`. If `extrainfo` is not set to `TRUE` in `update_motifs()`
#' or `to_list()`, then the extra columns will be discarded.
#' @param force Whether to coerce non-character data types into characters for
#' inclusion in `extrainfo`. If `force` is `FALSE` (the default), columns which
#' are not of type "character", "numeric", or "integer" (for example, list
#' columns, or logical values), will not be added to the motif `extrainfo`
#' slot, but will be passed onto the returned `universalmotif_df` unchanged.
#' Setting `force = TRUE` coerces these values into a character, adding them
#' to the `extrainfo` slot, and updating the `universalmotif_df` columns to
#' reflect this coercion. In other words, forcing inclusion of these data is
#' destructive and will change the column values. Use with caution.
#' @rdname tidy-motifs
update_motifs <- function(motif_df, extrainfo = TRUE, force = FALSE) {
# TODO: extrainfo implementation is very messy...
# TODO: come back and add alphabet change support once switch_alph() is updated
# TODO: performance
updated_df <- as.data.frame(motif_df)
if (!"motif" %in% colnames(updated_df))
stop("Could not find 'motif' column.")
m <- updated_df$motif
class(m) <- NULL
old_df <- as.data.frame(to_df(m, extrainfo = FALSE))
cols_new <- colnames(updated_df)
cols_new <- cols_new[cols_new != "motif"]
cols_old <- colnames(old_df)
cols_old <- cols_old[cols_old != "motif"]
holdout <- !force # makes it easier to reason about the code
extrainfo_holdout_cols <- NA_character_
if (extrainfo) {
# for now, just always update extrainfo...
cols_extrainfo <- cols_new[!cols_new %in% cols_old]
cols_extrainfo <- cols_extrainfo[cols_extrainfo != "bkg"]
if (length(cols_extrainfo)) {
# Keep id_cols temporarily to ensure sort order is OK
# TODO: in future use a better unique identifier
extrainfo_new <- updated_df[, cols_extrainfo, drop = FALSE]
if (holdout) {
# hold out unsupported datatypes
# Current "supported" types are "character", "numeric" and "integer"
extrainfo_classes <- vapply(extrainfo_new, class, character(1))
extrainfo_holdout_cols <- names(extrainfo_classes[!(extrainfo_classes %in% c("character", "numeric", "integer"))])
extrainfo_holdouts <- extrainfo_new[, extrainfo_holdout_cols, drop = FALSE]
# TODO: Consider a message for holdouts? I think it's unnecessary.
}
# if holding out columns, remove them from the extrainfo passed onto the motifs
if (holdout & length(extrainfo_holdout_cols) != 0) {
extrainfo_new <- extrainfo_new[, -which(names(extrainfo_new) %in% extrainfo_holdout_cols), drop = FALSE]
}
# Pass extrainfo to motif
for (i in seq_along(m)) {
m[[i]]["extrainfo"] <- clean_up_extrainfo_df(extrainfo_new[i, , drop = FALSE])
}
} else {
for (i in seq_along(m)) {
m[[i]]@extrainfo <- character()
}
}
} else if (any(!cols_new %in% cols_old)) {
# maybe remove this warning based on how extrainfo is implemented
message(wmsg(
"Discarding unknown slot(s) ",
paste0(paste0("'", cols_new[!cols_new %in% cols_old], "'"), collapse = ", "),
" (set `extrainfo=TRUE` to preserve these)."))
}
if (!extrainfo) {
for (i in seq_along(m)) {
m[[i]]@extrainfo <- character()
}
}
if (any(!cols_old %in% cols_new)) {
# hide this warning when called in to_list()?
message(wmsg(
"Restoring missing slot(s) ",
paste0(paste0("'", cols_old[!cols_old %in% cols_new], "'"), collapse = ", "),
"."))
}
cols_to_check <- cols_new[cols_new %in% cols_old]
if ("bkg" %in% cols_to_check) {
for (i in which(mapply(bkgs_are_different, old_df$bkg, updated_df$bkg, SIMPLIFY = TRUE))) {
m[[i]]["bkg"] <- updated_df$bkg[[i]]
}
cols_to_check <- cols_to_check[cols_to_check != "bkg"]
}
for (i in seq_along(cols_to_check)) {
checking <- cols_to_check[i]
# need a better way to deal with NAs...
# TODO: this causes 0s to be permanently introduced when altname is missing:
# dplyr::rename(mydf, name = altname, altname name)
updated_df[[checking]][is.na(updated_df[[checking]])] <- 0
old_df[[checking]][is.na(old_df[[checking]])] <- 0
if (any(updated_df[[checking]] != old_df[[checking]])) {
if (checking == "type") {
for (j in which(updated_df[[checking]] != old_df[[checking]])) {
m[[j]] <- convert_type(m[[j]], updated_df[[checking]][j])
}
} else if (checking %in% c("icscore", "consensus", "alphabet")) {
warning("Discarding changes in unmodifiable slot(s) '", checking, "'.",
immediate. = TRUE, call. = FALSE)
} else {
for (j in which(updated_df[[checking]] != old_df[[checking]])) {
msg <- try(m[[j]][checking] <- updated_df[[checking]][j], silent = TRUE)
if (inherits(msg, "try-error")) {
stop("Got the following error for motif in row ", j, ":\n", msg,
call. = FALSE)
}
}
}
}
}
if (holdout & extrainfo & all(!is.na(extrainfo_holdout_cols))){
# Add back any heldout info
# TODO: is `cbind` safe here? will row order always preserve?
new_df <- structure(cbind(to_df(m, extrainfo), extrainfo_holdouts),
class = c("universalmotif_df", "data.frame"))
return(new_df)
} else {
return(to_df(m, extrainfo))
}
}
#' @export
#' @rdname tidy-motifs
to_list <- function(motif_df, extrainfo = TRUE, force = FALSE) {
structure(update_motifs(motif_df, extrainfo, force)$motif, class = NULL)
}
#' @export
#' @rdname tidy-motifs
requires_update <- function(motifs, extrainfo = TRUE) {
# identical() is really quite slow...
motifs2 <- update_motifs(motifs, extrainfo)
any(!mapply(identical,
structure(motifs$motif, class = NULL),
structure(motifs2$motif, class = NULL)))
}
summarise_motifs_with_extras <- function(x) {
y <- summarise_motifs(x, na.rm = FALSE)
y$type <- vapply(x, function(x) x@type, character(1))
y$pseudocount <- vapply(x, function(x) x@pseudocount, numeric(1))
y$bkg <- lapply(x, function(x) x@bkg)
y
}
bkgs_are_different <- function(x, y) {
if (length(x) != length(y)) TRUE
else !all(x == y)
}
extrainfo_to_df <- function(x) {
y <- lapply(x, vec_to_df_mot)
cnames <- unique(unlist(lapply(y, colnames)))
for (i in seq_along(y)) {
if (!ncol(y[[i]])) {
y[[i]] <- vec_to_df(structure(rep(NA_character_, length(cnames)), names = cnames))
} else {
for (j in which(!cnames %in% colnames(y[[i]]))) {
y[[i]][[cnames[j]]] <- rep(NA_character_, nrow(y[[i]]))
}
y[[i]] <- y[[i]][, cnames, drop = FALSE]
}
}
do.call(rbind, y)
}
vec_to_df_mot <- function(x) {
vec_to_df(x@extrainfo)
}
vec_to_df <- function(x) {
if (!length(x)) data.frame()
else {
# TODO: need to mitigate risk of duplicate names here
if (is.null(names(x))) names(x) <- paste0("extrainfo.", seq_along(x))
names(x)[names(x) == ""] <- paste0("extrainfo.", seq_len(sum(names(x) == "")))
x <- list2DF(as.list(x))
x
}
}
clean_up_extrainfo_df <- function(x) {
y <- names(x)
x <- vapply(x, as.character, character(1))
x <- as.character(x)
if (all(is.na(x))) character()
else {
names(x) <- y
x[!is.na(x)]
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.