#' Import HOMER motifs.
#'
#' Import HOMER formatted motifs. See \url{http://homer.ucsd.edu/homer/motif/}.
#' Assumed to be DNA motifs. Note that HOMER motifs come with a pre-determined
#' logodds threshold; if you wish to re-create HOMER's motif scanning, then use
#' it in [scan_sequences()] (see examples).
#'
#' @return `list` [universalmotif-class] objects.
#'
#' @examples
#' data(ArabidopsisPromoters)
#' homer <- read_homer(system.file("extdata", "homer.txt",
#' package = "universalmotif"))
#' thresholds <- homer |> to_df() |> with(logodds.threshold) |> as.numeric()
#' scan_sequences(homer, ArabidopsisPromoters,
#' threshold = thresholds, threshold.type = "logodds.abs")
#'
#' @references
#'
#' Heinz S, Benner C, Spann N, Bertolino E, Lin YC, Laslo P, Cheng
#' JX, Murre C, Singh H, Glass CK (2010). “Simple combinations of
#' lineage-determining transcription factors prime cis-regulatory
#' elements required for macrophage and B cell identities.”
#' *Molecular Cell*, **38**, 576-589.
#'
#' @family read_motifs
#' @author Benjamin Jean-Marie Tremblay, \email{benjamin.tremblay@@uwaterloo.ca}
#' @inheritParams read_cisbp
#' @export
read_homer <- function(file, skip = 0) {
# TODO: better integration of the motif's own threshold with scan_sequences
# param check --------------------------------------------
args <- as.list(environment())
char_check <- check_fun_params(list(file = args$file),
1, FALSE, TYPE_CHAR)
num_check <- check_fun_params(list(skip = args$skip), 1, FALSE, TYPE_NUM)
all_checks <- c(char_check, num_check)
if (length(all_checks) > 0) stop(all_checks_collapse(all_checks))
#---------------------------------------------------------
raw_lines <- readLines(con <- file(file))
close(con)
if (skip > 0) raw_lines <- raw_lines[-seq_len(skip)]
raw_lines <- raw_lines[raw_lines != ""]
headers <- which(grepl("^>", raw_lines))
motif_starts <- headers + 1
if (length(headers) == 1) motif_stops <- length(raw_lines) else {
motif_stops <- c(headers[-1] - 1, length(raw_lines))
}
parse_motifs <- function(x, y) {
motif <- raw_lines[x:y]
motif <- vapply(motif,
function(x) as.numeric(strsplit(x, "\\s+")[[1]]),
numeric(4))
matrix(motif, ncol = 4, byrow = TRUE)
}
parse_meta <- function(x) {
x <- strsplit(x, "\\s+")[[1]]
if (grepl("/", x[2])) {
y <- strsplit(x[2], "/")[[1]]
y <- strsplit(y[1], "\\(")[[1]]
x[2] <- y[1]
family <- strsplit(y[2], "\\)")[[1]][1]
} else family <- character(0)
x2 <- strsplit(x[6], ",")[[1]]
nsites <- strsplit(strsplit(x2[1], "T:")[[1]][2], "\\(")[[1]][1]
bkgsites <- strsplit(strsplit(x2[2], "B:")[[1]][2], "\\(")[[1]][1]
pval <- strsplit(x2[3], "P:")[[1]][2]
c(name = x[2], nsites = nsites, bkgsites = bkgsites, pval = pval,
threshold = x[3], family = family)
}
motif_list <- mapply(parse_motifs, motif_starts, motif_stops,
SIMPLIFY = FALSE)
motif_meta <- lapply(raw_lines[headers], parse_meta)
homer2umot <- function(x, y) {
mot <- universalmotif_cpp(name = x[1],
nsites = ifelse(is.na(as.numeric(x[2])), numeric(0),
as.numeric(x[2])),
bkgsites = ifelse(is.na(as.numeric(x[3])), numeric(0),
as.numeric(x[3])),
motif = t(y),
alphabet = "DNA",
type = "PPM",
family = x[6],
pval = as.numeric(x["pval"]),
extrainfo = c(logodds = x["threshold"]))
validObject_universalmotif(mot)
mot
}
motifs <- mapply(homer2umot, motif_meta, motif_list,
SIMPLIFY = FALSE)
if (length(motifs) == 1) motifs <- motifs[[1]]
motifs
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.