Nothing
##########################
# Template functionalities
#########################
#' Validates a Template Object.
#'
#' Checks whether a Templates object is valid or not.
#'
#' @param object An input data frame to be checked for being a template data frame.
#' @return \code{TRUE}, if the object is valid, \code{FALSE} otherwise.
#' @keywords internal
validate_templates <- function(object) {
# specify minimal set of columns that should be present in a template data frame:
errors <- NULL
required.fields <- list("ID" = "character", "Header" = "character",
"Identifier" = c("integer", "numeric"), "Sequence_Length" = c("integer", "numeric"),
"Group" = "character", "Allowed_Start_fw" = c("integer", "numeric"),
"Allowed_End_fw" = c("integer", "numeric"), "Allowed_Start_rev" = c("integer", "numeric"),
"Allowed_End_rev" = c("integer", "numeric"), "Allowed_fw" = "character", "Allowed_rev" = "character",
"Allowed_Start_fw_initial" = c("integer", "numeric"), "Allowed_End_fw_initial" = c("integer", "numeric"),
"Allowed_Start_rev_initial" = c("integer", "numeric"),
"Allowed_End_rev_initial" = c("integer", "numeric"), "Sequence" = "character",
"Run" = "character")
if (!is(object, "data.frame")) {
errors <- c(errors, "Input was no data frame.")
return(FALSE)
}
possible.fields <- NULL # don't check for additional fields here
check.fields <- check_setting(possible.fields, object, required.fields)
if (is.character(check.fields)) {
errors <- c(errors, check.fields)
}
if (length(errors) != 0) {
return(errors)
} else {
# ensure that 'Run' is unique
check.run <- length(unique(object$Run)) <= 1
if (check.run) {
return(TRUE)
} else {
msg <- "The 'Run' column may only contain a single unique value."
return(msg)
}
}
}
#' @rdname Input
#' @name Input
#' @aliases Templates-class
#' @details
#' In the following you can find a description
#' of the most important columns that can be found
#' in an object of class \code{Templates}. Note that
#' angle brackets in the column names
#' indicate the existence of multiple possibilities.
#' \describe{
#' \item{\code{ID}}{The identifiers of the templates.}
#' \item{\code{Identifier}}{The internal identifiers of the templates.}
#' \item{\code{Group}}{The identifiers of the groups that the templates belong to.}
#' \item{\code{Allowed_Start_<fw|rev>}}{The start of the interval in the templates
#' where binding is allowed for forward and reverse primers, respectively.}
#' \item{\code{Allowed_End_<fw|rev>}}{The end of the interval in the templates
#' where binding is allowed for forward and reverse primers, respectively.}
#' \item{\code{Allowed_<fw|rev>}}{The template sequence where binding
#' is allowed for forward and reverse primers, respectively.}
#' \item{\code{Run}}{An identifier for the set of template sequences.}
#' \item{\code{Covered_By_Primers}}{The identifiers of primers covering the templates,
#' when the template coverage has been annotated.}
#' \item{\code{primer_coverage}}{The number of primers covering the templates,
#' when the template coverage has been annotated.}
#' }
#' @return The \code{Templates} constructor returns
#' a \code{Templates} object, an instance of a data frame.
#' @exportClass Templates
#' @export
#' @examples
#'
#' # Load a set of templates:
#' fasta.file <- system.file("extdata", "IMGT_data", "templates",
#' "Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
#' hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
#' template.df <- read_templates(fasta.file, hdr.structure, "|", "GROUP")
setClass("Templates",
contains = c("data.frame"), validity = validate_templates)
setMethod("initialize", "Templates",
function(.Object, ...) {
# just call data frame constructor
.Object <- callNextMethod(.Object, ...)
}
)
setMethod("show", "Templates", function(object) {
# overwrite the 'print' function
my_show_df(asS3(object))
})
#' @rdname Input
#' @name Input
#' @aliases Templates-class
#' @export
Templates <- function(...) new("Templates", ...)
#' cbind for Template class.
#'
#' Ensures that the cbind result has the appropriate class.
#'
#' @param ... Parameters for cbind function.
#' @return Column binded Templates data frame.
#' @keywords internal
#' @export
#' @examples
#' data(Ippolito)
#' template.df <- cbind(template.df, seq_len(nrow(template.df)))
cbind.Templates <- function(...) {
df <- cbind.data.frame(...)
df <- Templates(df)
return(df)
}
#' rbind for Template class.
#'
#' Ensures that the rbind result has the appropriate class.
#'
#' @param ... Parameters for rbind function.
#' @return Row-binded Templates data frame.
#' @keywords internal
#' @export
#' @examples
#' data(Ippolito)
#' template.df <- rbind(template.df, template.df)
rbind.Templates <- function(...) {
df <- rbind.data.frame(...)
df <- Templates(df)
return(df)
}
#' S4 cbind for Templates.
#'
#' S4 cbind function for Templates data frame.
#'
#' @export
#' @return Cbinded template data frame.
#' @rdname Templates-method
#' @param x The Templates data frame.
#' @param y Another data frame.
#' @keywords internal
#' @examples
#' data(Ippolito)
#' template.df <- cbind2(template.df, seq_len(nrow(template.df)))
setMethod("cbind2", "Templates",
# need this in case of S3 dispatch fails.
function(x, y, ...) {
df <- cbind.data.frame(x, y, ...)
df <- Templates(df)
return(df)
}
)
#' S4 rbind for Templates.
#'
#' S4 rbind function for templates.
#'
#' @export
#' @rdname Templates-method
#' @return Rbinded template data frame.
#' @keywords internal
#' @examples
#' data(Ippolito)
#' template.df <- rbind2(template.df, template.df)
setMethod("rbind2", "Templates",
# need this in case of S3 dispatch fails.
function(x, y, ...) {
df <- rbind.data.frame(x, y, ...)
df <- Templates(df)
return(df)
}
)
#' Slicing Operator for Templates.
#'
#' Slicing of Templates data frame object.
#'
#' @param x The Template data frame.
#' @param i The row index.
#' @param j The column index.
#' @param ... Other arguments to the slice operator.
#' @param drop Simplify data frame?
#' @exportMethod [
#' @rdname Templates-method
#' @keywords internal
#' @aliases [,Templates-method [,Templates,ANY-method
#' @return Subsetted template data frame.
#' @examples
#' data(Ippolito)
#' template.df <- template.df[1:2,]
setMethod("[", c("Templates", "ANY", "ANY"),
function(x, i, j, ..., drop = TRUE) {
if (missing(drop)) {
df <- asS3(x)[i, j, ...]
} else {
df <- asS3(x)[i, j, ..., drop = drop]
}
if (is(df, "data.frame")) {
# only set my class if we haven't simplified.
#options("show.error.messages" = FALSE)
t.df <- suppressWarnings(try(Templates(df), silent = TRUE))
#options("show.error.messages" = TRUE)
if (is(t.df, "try-error")) {
# we removed some crucial columns -> No Templates object anymore.
t.df <- asS3(df)
}
df <- t.df
}
return(df)
}
)
#' Dollar Operator for Templates.
#'
#' Set a column in a Templates data frame.
#'
#' @exportMethod $<-
#' @rdname Templates-method
#' @param name The name of the column.
#' @param value The values of the column.
#' @return Templates with replaced column.
#' @keywords internal
#' @examples
#' data(Ippolito)
#' template.df$ID[1] <- "newID"
setMethod("$<-", "Templates",
function(x, name, value) {
df <- asS3(x)
eval(parse(text = paste("df$", name, " <- value", sep = "")))
df <- Templates(df)
return(df)
}
)
#' Removal of Partial Sequences.
#'
#' Removes partial template sequences.
#'
#' @param template.df Template data frame.
#' @param Header keywords indiating templates that should be excluded.
#' @return Template data frame with partial sequences removed.
#' @keywords internal
remove.seqs.by.keyword <- function(template.df, keyword = "partial") {
if (length(keyword) == 0) {
# nothing to remove
return(template.df)
}
rm.idx <- unique(unlist(lapply(keyword, function(x) grep(x, template.df$Header))))
if (length(rm.idx) != 0) {
template.df <- template.df[-rm.idx, ]
}
return(template.df)
}
#' Parse FASTA headers
#'
#' Parses the headers of a FASTA file.
#'
#' @param hdr The headers (> entries) of a FASTA File.
#' @param delim The delimiter used to separate distinct fields in the headers.
#' For example, \code{|} for headers such as > E.coli | GeneX | ...
#' @param hdr.str Names of the fields appearing in the header.
#' @param id.column Field in the header to be used used as an identifier for the sequences.
#' @return Data frame with structured information from the headers.
#' @keywords internal
parse.header <- function(hdr, delim, hdr.str, id.column) {
result <- data.frame(Header = hdr, stringsAsFactors = FALSE)
data.df <- NULL
if (length(hdr.str) != 0 && delim != "") {
s <- strsplit(gsub(">", "", hdr), split = delim, fixed = TRUE)
data.df <- data.frame(matrix(rep(NA, length(hdr.str) * length(hdr)), ncol = length(hdr.str)),
stringsAsFactors = FALSE)
colnames(data.df) <- hdr.str
for (i in seq_along(s)) {
h <- s[[i]]
if (length(hdr.str) > length(h)) {
my.error("TemplateHeaderStructure", "Header is too short for given header structure!")
}
elements <- h[seq_along(hdr.str)]
data.df[i, ] <- elements
}
} else if (length(hdr.str) != 0 && delim == "") {
# no delimiter or header structure given -> use the complete header info for the first specified field
s <- gsub(">", "", hdr)
data.df <- data.frame(Info = s, stringsAsFactors = FALSE)
colnames(data.df) <- hdr.str[1]
}
if (length(data.df) != 0) {
result <- cbind(result, data.df)
}
if (!id.column %in% colnames(result)) {
my.warning("TemplateIDColNotFound", paste("ID column: ", id.column, " was not found in the header of the template. Using first available header variable as ID.",
sep = ""))
result <- cbind(ID = result[, 1], result, stringsAsFactors = FALSE)
} else {
result <- cbind(ID = result[, id.column], result, stringsAsFactors = FALSE)
}
# create an identifier to match to leader sequences: should be unique and be
# present in leader file
if (!"Group" %in% colnames(result)) {
result$Group <- "default"
}
return(result)
}
#' @rdname Input
#' @name Input
#' @aliases read_templates
#' @details
#' When loading a FASTA file with \code{read_templates},
#' the input arguments \code{hdr.structure}, \code{delim}, \code{id.column},
#' \code{rm.keywords}, \code{remove.duplicates}, \code{fw.region},
#' \code{rev.region}, \code{gap.character}, and \code{run} are utilized.
#' Most importantly, \code{hdr.structure} and \code{delim} should
#' match the FASTA header structure.
#' To learn more about setting the primer binding regions, consider the
#' \code{\link{assign_binding_regions}} function.
#' In contrast, when a CSV file is loaded with \code{read_templates},
#' the data are loaded without performing any modifications because the CSV file
#' should represent an object of class \code{\link{Templates}}, which
#' can be stored using the \code{\link{write_templates}} function.
#'
#' @return \code{read_templates} returns a single object of class
#' \code{Templates} if a single filename was provided or a list
#' of such objects if multiple file names were provided.
#' @export
#' @examples
#' # Load templates from a FASTA file
#' fasta.file <- system.file("extdata", "IMGT_data", "templates",
#' "Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
#' hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
#' template.df.fasta <- read_templates(fasta.file, hdr.structure, "|", "GROUP")
#' # Load mutliple FASTA files
#' fasta.files <- c(fasta.file, fasta.file)
#' template.df.fastas <- read_templates(fasta.files, hdr.structure, "|", "GROUP")
#' # Load templates from a previously stored CSV file
#' csv.file <- system.file("extdata", "IMGT_data", "comparison",
#' "templates", "IGH_templates.csv", package = "openPrimeR")
#' template.df.csv <- read_templates(csv.file)
#' # Load multiple CSV files:
#' csv.files <- c(csv.file, csv.file)
#' template.df.csvs <- read_templates(csv.files)
#' # Load a mixture of FASTA/CSV files:
#' mixed.files <- c(csv.file, fasta.file)
#' template.data <- read_templates(mixed.files)
read_templates <- function(fname, hdr.structure = NULL, delim = NULL, id.column = NULL,
rm.keywords = NULL, remove.duplicates = FALSE, fw.region = c(1,30),
rev.region = c(1,30), gap.char = "-", run = NULL) {
if (length(fname) > 1) {
template.df <- read_templates_multiple(fname,
hdr.structure = hdr.structure, delim = delim,
id.column = id.column, rm.keywords = rm.keywords,
remove.duplicates = remove.duplicates,
fw.region = fw.region, rev.region = rev.region,
gap.character = gap.char, run = run)
} else {
template.df <- read_templates_single(fname,
hdr.structure = hdr.structure, delim = delim,
id.column = id.column, rm.keywords = rm.keywords,
remove.duplicates = remove.duplicates,
fw.region = fw.region, rev.region = rev.region,
gap.character = gap.char, run = run)
}
return(template.df)
}
#' Read Template CSV File
#'
#' Reads an input template CSV file.
#'
#' @param fname The filename of the input template CSV file.
#'
#' @return A template data frame.
#' @keywords internal
read_templates_csv <- function(fname) {
if (!file.exists(fname)) {
stop("Templates CSV file at '", fname, "' could not be found.")
}
templates <- try(read.csv(fname, stringsAsFactors = FALSE, row.names = NULL), silent = TRUE)
if (is(templates, "try-error")) {
my.error("TemplateFormatIncorrect",
paste0("Could not read the input file: ",
" '", fname, "' as a CSV. Please check your input!"))
}
char.cols <- c("Allowed_fw", "Allowed_rev",
"Covered_By_Primers", "Covered_By_Primers_fw",
"Covered_By_Primers_rev")
num.cols <- c("Allowed_Start_fw","Allowed_Start_rev",
"Allowed_End_fw", "Allowed_End_rev",
"Allowed_Start_fw_initial", "Allowed_Start_rev_initial",
"Allowed_End_fw_initial", "Allowed_End_rev_initial",
"primer_coverage", "primer_coverage_fw",
"primer_coverage_rev")
c.cols <- char.cols[char.cols %in% colnames(templates)]
templates[, c.cols] <- apply(templates[, c.cols], 2, as.character)
n.cols <- num.cols[num.cols %in% colnames(templates)]
templates[, n.cols] <- apply(templates[,n.cols], 2, as.numeric)
# set NA to empty string for character columns since "" is converted to NA by write.csv()
templates[, c.cols][is.na(templates[, c.cols])] <- c("")
templates <- try(Templates(templates))
if (is(templates, "try-error")) {
my.error("TemplateFormatIncorrect",
paste0("Could not construct a 'Templates' object from the",
" CSV input file '", fname, "'. Please check that",
" the structure of the input file is correct."), silent = TRUE)
}
return(templates)
}
#' Input of a Single Template File.
#'
#' Read template sequences from a FASTA or CSV file.
#'
#' When supplying a FASTA file with template sequences,
#' the input arguments \code{hdr.structure}, \code{delim}, \code{id.column},
#' \code{rm.keywords}, \code{remove.duplicates}, \code{fw.region},
#' \code{rev.region}, \code{gap.character}, and \code{run} are utilized.
#' Most importantly, \code{hdr.structure} and \code{delim} should
#' match the FASTA header structure.
#' When supplying a CSV file with template sequences,
#' the data are loaded without any modification and the CSV file
#' should represent an object of class \code{\link{Templates}}, which
#' can be stored using the \code{\link{write_templates}} function.
#'
#' @param template.file Path to a FASTA or CSV file containing the template sequences.
#' @param hdr.structure A character vector describing the information contained in the FASTA headers. In case that the headers of \code{fasta.file} contain
#' template group information, please include the keyword "GROUP" in
#' \code{hdr.structure}.
#' @param delim Delimiter for the information in the FASTA headers.
#' @param id.column Field in the header to be used as the identifier.
#' @param rm.keywords A vector of keywords that are used to remove templates whose headers contain any of the keywords.
#' @param remove.duplicates Whether duplicate sequence shall be removed.
#' @param fw.region The positional interval from the template 5' end specifying the
#' binding sites for forward primers.
#' @param rev.region The positional interval from the template 3' end specifying
#' the binding sites for reverse primers.
#' @param gap.character The character in the input file representing gaps.
#' Gaps are automatically removed upon input.
#' @param run An identifier for the template sequences.
#' @return An object of class \code{Templates}.
#' @keywords internal
read_templates_single <- function(template.file, hdr.structure = NULL, delim = NULL, id.column = NULL,
rm.keywords = NULL, remove.duplicates = FALSE, fw.region = c(1,30),
rev.region = c(1,30), gap.character = "-", run = NULL) {
template.df <- try(read_templates_fasta(template.file, hdr.structure,
delim, id.column, rm.keywords, remove.duplicates,
fw.region, rev.region, gap.character, run), silent = TRUE)
if (is(template.df, "try-error")) {
template.df <- try(read_templates_csv(template.file), silent = TRUE)
if (is(template.df, "try-error")) {
my.error("TemplateFormatIncorrect",
paste0("Unsupported template input file type or error reading data for file: '", template.file, "'"))
}
}
# check template quality: degeneration
degen <- score_degen(strsplit(template.df$Sequence, split = ""), gap.char = gap.character)
degen.idx <- which(degen > 2^10)
if (length(degen.idx) > 0) {
msg <- paste("At least one template sequence was highly degenerate.",
"Please check the quality of the input template sequences.",
"The following sequences will not be disambiguated:",
paste(template.df$ID[degen.idx], collapse = ","))
my.warning("TemplateDegeneration", msg)
}
# ensure that IDs of templates are unique
dup.idx <- which(duplicated(template.df$ID))
if (length(dup.idx) != 0) {
# make IDs unique
non.unique.IDs <- unique(template.df$ID[dup.idx])
for (ID in non.unique.IDs) {
idx <- which(template.df$ID == ID)
new.IDs <- paste0(ID, "_", seq_along(idx))
template.df$ID[idx] <- new.IDs
}
}
return(template.df)
}
####
#' Input of Multiple Templates.
#'
#' Reads multiple template CSV/FASTA files.
#'
#' @param filenames Names of FASTA/CSV files containing template data.
#' @param hdr.structure A character vector describing the information contained in the FASTA headers. In case that the headers of \code{fasta.file} contain
#' template group information, please include the keyword "GROUP" in
#' \code{hdr.structure}.
#' @param delim Delimiter for the information in the FASTA headers.
#' @param id.column Field in the header to be used as the identifier.
#' @param rm.keywords A vector of keywords that are used to remove templates whose headers contain any of the keywords.
#' @param remove.duplicates Whether duplicate sequence shall be removed.
#' @param fw.region The positional interval from the template 5' end specifying the
#' binding sites for forward primers.
#' @param rev.region The positional interval from the template 3' end specifying
#' the binding sites for reverse primers.
#' @param gap.character The character in the input file representing gaps.
#' Gaps are automatically removed upon input.
#' @param run An identifier for the template sequences.
#' @return A list containing objects of class \code{Templates}.
#' @keywords internal
read_templates_multiple <- function(filenames, hdr.structure = NULL, delim = NULL, id.column = NULL,
rm.keywords = NULL, remove.duplicates = FALSE, fw.region = c(1,30),
rev.region = c(1,30), gap.character = "-", run = NULL) {
data <- lapply(filenames, function(x) read_templates_single(x,
hdr.structure = hdr.structure, delim = delim,
id.column = id.column, rm.keywords = rm.keywords,
remove.duplicates = remove.duplicates,
fw.region = fw.region, rev.region = rev.region,
gap.character = gap.character, run = run))
data <- set.run.names(data, filenames)
return(data)
}
#' Input of Template Sequences.
#'
#' Read template sequences from a FASTA file.
#'
#' @param fasta.file Path to a FASTA file containing the template sequences.
#' @param hdr.structure Names describing the information contained in the FASTA headers. In case that the headers of \code{fasta.file} contain
#' template group information, please include the keyword "GROUP" in
#' \code{hdr.structure}.
#' @param delim Delimiter for the information in the FASTA headers.
#' @param id.column Field in the header to be used as the identifier.
#' @param rm.keywords A vector of keywords that are used to remove templates whose headers contain any of the keywords.
#' @param remove.duplicates Whether duplicate sequence shall be removed.
#' @param fw.region The positional interval from the template 5' end specifying the
#' binding sites for forward primers.
#' @param rev.region The positional interval from the template 3' end specifying
#' the binding sites for reverse primers.
#' @param gap.character The character in the input file representing gaps.
#' Gaps are automatically removed upon input.
#' @param run An identifier for the template sequences.
#' @return An object of class \code{Templates}.
#' @keywords internal
#' @examples
#' fasta.file <- system.file("extdata", "IMGT_data", "templates",
#' "Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
#' hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
#' template.df <- read_templates(fasta.file, hdr.structure, "|", "GROUP")
read_templates_fasta <- function(fasta.file, hdr.structure = NULL, delim = NULL, id.column = NULL,
rm.keywords = NULL, remove.duplicates = FALSE, fw.region = c(1,30),
rev.region = c(1,30), gap.character = "-", run = NULL) {
allowed.nts <- c(tolower(names(IUPAC_CODE_MAP)), gap.character)
seqs <- my.read.fasta(fasta.file, allowed.nts)
if (is.null(seqs)) {
return(NULL)
}
if (is.null(run)) {
# use filename w/o extension as identifier
run <- sub("^([^.]*).*", "\\1", basename(fasta.file))
}
if (length(id.column) != 0) {
if (length(id.column) != 1) {
stop("Please specify only a single id.column.")
}
id.col.ok <- any(grepl(id.column[1], hdr.structure))
if (!id.col.ok) {
my.error("ID_Column_Not_Found", paste0("The specified",
"ID column was not part of the header structure."))
}
} else {
# use the first hdr.structure field as the ID if not id.col is specified
id.column <- hdr.structure[1]
}
if (length(hdr.structure) != 0 && length(delim) == 0) {
warning("Header structure provided, but no delimiter. Cannot read meta information!")
}
if (length(hdr.structure) == 0 && length(delim) != 0) {
warning("Header delimiter supplied, but no header structure. Cannot read meta information!")
}
if (length(delim) == 0) {
delim <- ""
}
if (length(hdr.structure) == 0) {
# no hdr.structure supplied
id.column <- "HEADER" # use the full header as the ID column
}
# retrieve seqs/replace gap char
s <- sapply(seqs, function(x) paste(x, collapse = ""))
hdr <- sapply(seqs, function(x) attr(x, "Annot"))
# determine IDs and groups parse hdr structure
hdr.str <- paste(substr(hdr.structure, 1, 1),
substr(tolower(hdr.structure), 2, nchar(hdr.structure)),
sep = "")
id.column <- paste(substr(id.column, 1, 1), substr(tolower(id.column), 2, nchar(id.column)),
sep = "") # rename according to renaming in parse.hdr.structure
ids <- parse.header(hdr, delim, hdr.str, id.column) # data frame from parsed header
if (length(ids) == 0) {
return(NULL)
}
d <- data.frame(ids, Identifier = seq_along(s), InputSequence = s, Sequence_Length = nchar(s),
stringsAsFactors = FALSE)
rownames(d) <- NULL
# annotate gene groups: try to parse IMGT header structure
gene.groups <- try(parse.IMGT.gene.groups(d$Group))
if (is(gene.groups[1], "try-error")) {
gene.groups <- NULL
}
if (!is.null(gene.groups)) {
# IMGT header structure -> get some more info
groups <- gene.groups$Gene_Group
if (!all(is.na(groups))) {
d$Group <- groups
}
}
# exclude partial seqs
if (length(rm.keywords) != 0) {
d <- remove.seqs.by.keyword(d, rm.keywords)
}
# exclude duplicates
if (remove.duplicates) {
idx <- which(duplicated(d$InputSequence))
if (length(idx) != 0) {
d <- d[-idx,]
}
}
# add default binding regions:
d$Sequence <- ungap_sequence(d$InputSequence, gap.character)
default.leaders <- create.uniform.leaders(fw.region, rev.region, d, gap.character)
d <- cbind(d, default.leaders)
d$Run <- run
# re-order:
last.cols <- c("Sequence", "InputSequence", "Run")
cols <- setdiff(colnames(d), last.cols)
cols <- c(cols, last.cols)
d <- d[, cols]
template.df <- Templates(d)
return(template.df)
}
#' Read Sequences.
#'
#' Reads an input FASTA file.
#'
#' @param fasta.file The path to a FASTA file.
#' @param The character indicating gaps in sequences.
#' @return A data frame with sequences.
#' @keywords internal
read.sequences <- function(fasta.file, gap.character) {
if (length(fasta.file) == 0) {
return(NULL)
}
# supress warning when there's an extra-newline tryCatch(
allowed.nts <- c(tolower(names(IUPAC_CODE_MAP)), gap.character)
seqs <- my.read.fasta(fasta.file, allowed.nts)
s <- sapply(seqs, function(x) paste(x, collapse = ""))
hdr <- sapply(seqs, function(x) attr(x, "Annot"))
d <- data.frame(Identifier = seq_along(s), Header = hdr, Sequence = s, Sequence_Length = nchar(s),
stringsAsFactors = FALSE)
rownames(d) <- NULL
return(d)
}
#' @rdname TemplatesFunctions
#' @name TemplatesFunctions
#' @aliases update_template_cvg
#' @return \code{update_template_cvg} returns an object of class
#' \code{Templates} with updated coverage columns.
#' @export
#' @examples
#'
#' # Annotate the coverage of the templates
#' data(Ippolito)
#' template.df <- update_template_cvg(template.df, primer.df)
update_template_cvg <- function(template.df, primer.df,
mode.directionality = NULL) {
if (length(template.df) == 0 || length(primer.df) == 0) {
# nothing to update ...
return(NULL)
}
if (!is(primer.df, "Primers")) {
stop("Please supply a valid primer data frame.")
}
if (!is(template.df, "Templates")) {
stop("Please provide a valid template data frame.")
}
# check whether primers and templates correspond to one another
if (!check_correspondence(primer.df, template.df)) {
msg <- paste("Could not match all coverage events to the input template data frame.",
"Please check whether the primers and templates correspond.")
stop(msg)
}
if (is.null(mode.directionality)) {
mode.directionality <- get.analysis.mode(primer.df)
} else {
mode.directionality <- match.arg(mode.directionality, c("fw", "rev", "both"))
}
cvg.matrix <- get.coverage.matrix(primer.df, template.df) # template x primers: 1/0
if (length(cvg.matrix) != 0) {
# require any primer to cover a template
# for direction "both": covered seqs are fw AND rev primers
idx <- lapply(seq_len(nrow(cvg.matrix)), function(x)
which(cvg.matrix[x,] != 0)) # idx of covering primers per template
} else {
idx <- NULL
}
# add columns:
cols <- c("Covered_By_Primers")
extra.cols <- c(paste(cols, "_fw", sep = ""), paste(cols, "_rev", sep = ""))
all.cols <- c(cols, extra.cols)
for (i in seq_along(all.cols)) {
template.df[, all.cols[i]] <- rep("", nrow(template.df))
}
cols <- c("primer_coverage")
extra.cols <- c(paste(cols, "_fw", sep = ""), paste(cols, "_rev", sep = ""))
all.cols <- c(cols, extra.cols)
for (i in seq_along(all.cols)) {
template.df[, all.cols[i]] <- rep(0, nrow(template.df))
}
if (length(idx) != 0) {
fw.primer.idx <- which(primer.df$Forward != "")
rev.primer.idx <- which(primer.df$Reverse != "")
if (mode.directionality == "both") { # require fw & rev binding events
# coverage by both a foward and a reverse primer?
both.idx <- sapply(idx, function(x) any(x %in% fw.primer.idx) & any(x %in% rev.primer.idx))
primer.IDs <- lapply(seq_along(idx), function(x) if(both.idx[x]) primer.df$Identifier[idx[[x]]] else numeric(0))
} else {
primer.IDs <- lapply(idx, function(x) primer.df$Identifier[x])
}
template.df$Covered_By_Primers <- sapply(primer.IDs, function(x) paste(x, collapse = ","))
# n.b.: for primers with two directions, template cvg is total nbr of fw/rev primers!
template.df$primer_coverage <- sapply(primer.IDs, length)
# individual direction coverage:
new.entries <- data.frame(
Covered_By_Primers_fw = "",
primer_coverage_fw = 0,
Covered_By_Primers_rev = "",
primer_coverage_rev = 0, stringsAsFactors = FALSE)
col.idx <- which(!colnames(new.entries) %in% colnames(template.df))
if (length(col.idx) != 0) {
# add missing fields
template.df <- cbind(template.df, new.entries[,col.idx])
}
if (mode.directionality == "fw" || mode.directionality == "both") {
idx.fw <- lapply(idx, function(x) intersect(x, fw.primer.idx))
primer.IDs.fw <- lapply(idx.fw, function(x) primer.df$Identifier[x])
template.df$Covered_By_Primers_fw <- sapply(primer.IDs.fw, function(x) paste(x,
collapse = ","))
template.df$primer_coverage_fw <- sapply(primer.IDs.fw, length)
}
if (mode.directionality == "rev" || mode.directionality == "both") {
idx.rev <- lapply(idx, function(x) intersect(x, rev.primer.idx))
primer.IDs.rev <- lapply(idx.rev, function(x) primer.df$Identifier[x])
template.df$Covered_By_Primers_rev <- sapply(primer.IDs.rev, function(x) paste(x,
collapse = ","))
template.df$primer_coverage_rev <- sapply(primer.IDs.rev, length)
}
}
return(template.df)
}
#' @rdname TemplatesFunctions
#' @name TemplatesFunctions
#' @aliases adjust_binding_regions
#' @details
#' When modifying binding regions with \code{adjust_binding_regions}, new
#' binding intervals can be specified via \code{fw} and \code{rev}
#' for forward and reverse primers, respectively. The new regions should
#' be provided relative to the existing definition of binding regions
#' in \code{template.df}.
#' For specifying the new binding regions, position \code{0} refers to
#' the first position after the end of the existing
#' binding region. Hence, negative positions relate to regions within the
#' existing binding region, while non-negative values relate to positions
#' outside the defined binding region.
#'
#' @return \code{adjust_binding_regions} returns a \code{Templates} object with
#' updated binding regions.
#' @export
#' @examples
#' data(Ippolito)
#' # Extend the binding region by one position
#' relative.interval <- c(-max(template.df$Allowed_End_fw), 0)
#' template.df.adj <- adjust_binding_regions(template.df, relative.interval)
#' # compare old and new annotations:
#' head(cbind(template.df$Allowed_Start_fw, template.df$Allowed_End_fw))
#' head(cbind(template.df.adj$Allowed_Start_fw, template.df.adj$Allowed_End_fw))
adjust_binding_regions <- function(template.df, region.fw, region.rev) {
if (!is(template.df, "Templates")) {
stop("'template.df' should be a 'Templates' object.")
}
if (missing(region.fw)) {
region.fw <- NULL
}
if (missing(region.rev)) {
region.rev <- NULL
}
cols <- c("Allowed_Start_fw", "Allowed_End_fw", "Allowed_Start_rev", "Allowed_End_rev")
if (length(region.fw) != 0) {
if (length(region.fw) != 2) {
stop("'region.fw' wasn't an interval.")
}
min <- region.fw[1]
max <- region.fw[2]
template.df <- update.individual.binding.region(min, max, template.df, "fw")
}
if (length(region.rev) != 0) {
if (length(region.rev) != 2) {
stop("'region.rev' wasn't an interval.")
}
min <- region.rev[1]
max <- region.rev[2]
template.df <- update.individual.binding.region(min, max, template.df, "rev")
}
return(template.df)
}
#' Adjustment of Existing Binding Regions for one Direction.
#'
#' Adjusts the existing annotation of binding regions by specifying
#' an interval relative to the existing binding region.
#'
#' Position \code{0} indicates the first position after the existing
#' binding region. Hence, negative positions adjust the binding region towards
#' the existing binding regions and non-negative positionis extend
#' the existing binding region definition aways from the existing target region.
#'
#' @param min Position where binding should start.
#' @param max End position of binding.
#' @param Template data frame.
#' @param mode.directionality Directionality of primers.
#' @return Template data frame with updated binding regions.
#' @keywords internal
update.individual.binding.region <- function(min, max, template.df, mode.directionality) {
# min/max: new setting for binding position relative to target region
allowed.start.initial <- paste("Allowed_Start_", mode.directionality, "_initial",
sep = "")
allowed.start <- paste("Allowed_Start_", mode.directionality, sep = "")
allowed.end.initial <- paste("Allowed_End_", mode.directionality, "_initial",
sep = "")
allowed.end <- paste("Allowed_End_", mode.directionality, sep = "")
allowed.region <- paste("Allowed_", mode.directionality, sep = "")
if (mode.directionality == "fw") {
target.pos <- template.df[, allowed.end.initial] + 1 # absolute target position
} else {
target.pos <- template.df[, allowed.start.initial] - 1
}
if (length(min) != 0 && length(max) != 0) {
# start
if (mode.directionality == "fw") {
new.start <- sapply(target.pos + min, function(x) max(x, 1))
sel <- new.start >= nchar(template.df$Sequence)
new.start[sel] <- nchar(template.df$Sequence)[sel]
template.df[, allowed.start] <- new.start
} else {
new.end <- sapply(seq_len(nrow(template.df)), function(x) min(target.pos[x] -
min, nchar(template.df$Sequence[x])))
new.end[new.end <= 0] <- 1
template.df[, allowed.end] <- new.end
}
# end
if (mode.directionality == "fw") {
new.end <- sapply(seq_len(nrow(template.df)), function(x) min(target.pos[x] +
max, nchar(template.df$Sequence[x])))
new.end[new.end <= 0] <- 1
template.df[, allowed.end] <- new.end
} else {
new.start <- sapply(target.pos - max, function(x) max(x, 1))
template.df[, allowed.start] <- new.start
}
# update allowed region
template.df[, allowed.region] <- substring(template.df$Sequence, template.df[,
allowed.start], template.df[, allowed.end])
}
return(template.df)
}
#' @rdname TemplatesFunctions
#' @name TemplatesFunctions
#' @aliases assign_binding_regions
#' @details
#' Binding regions are defined using \code{assign_binding_regions}, where
#' the arguments \code{fw} and \code{rev} provide data describing
#' the binding regions of the forward and reverse primers, respectively.
#' To specify binding regions for each template
#' individually, \code{fw} and \code{rev} should provide the paths to FASTA files. The headers of these
#' FASTA file should match the headers of the loaded \code{template.df}
#' and the sequences in the files specified by \code{fw} and \code{rev} should indicate
#' the target binding regions.
#'
#' To specify uniform binding regions,
#' \code{fw} and \code{rev} should be numeric intervals indicating the allowed
#' binding range for primers in the templates. Setting the forward interval
#' to (1,30) indicates that the first 30 bases should be used for forward primers
#' and specifying the reverse interval to (1,30) indicates that the last
#' 30 bases should be used for reverse primer binding.
#'
#' If \code{optimize.region} is \code{TRUE}, the input binding region is
#' adjusted such that regions forming secondary structures are avoided.
#'
#' @return \code{assign_binding_regions} returns an object of class \code{Templates} with newly assigned binding regions.
#' @export
#' @note \code{assign_binding_regions} requires the program ViennaRNA
#' (https://www.tbi.univie.ac.at/RNA/)
#' for adjusting the binding regions when \code{optimize.region}
#' is set to \code{TRUE}.
#' @examples
#' data(Ippolito)
#' # Assignment of individual binding regions
#' l.fasta.file <- system.file("extdata", "IMGT_data", "templates",
#' "Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
#' template.df.individual <- assign_binding_regions(template.df, l.fasta.file, NULL)
#' # Assign the first/last 30 bases as forward/reverse binding regions
#' template.df.uniform <- assign_binding_regions(template.df, c(1,30), c(1,30))
#' # Optimization of binding regions (requires ViennaRNA)
#' \dontrun{template.df.opti <- assign_binding_regions(template.df, c(1,30), c(1,30),
#' optimize.region = TRUE, primer.length = 20)}
assign_binding_regions <- function(template.df, fw = NULL, rev = NULL, optimize.region = FALSE,
primer.length = 20, gap.char = "-") {
if (is.null(fw) && !is.null(rev)) {
fw <- eval(parse(text = paste(class(rev), "(0)", sep = "")))
} else if (is.null(rev) && !is.null(fw)) {
rev <- eval(parse(text = paste(class(fw), "(0)", sep = "")))
} else if (is.null(fw) && is.null(rev)) {
warning("'fw' and 'rev' were both NULL. No binding regions were annotated.")
return(template.df)
}
if (!is(template.df, "Templates")) {
stop("Please supply a valid template data frame.")
}
if (optimize.region && length(primer.length) != 1) {
stop("Please specify a numeric, scalar primer length.")
}
UseMethod("assign_binding_regions", fw)
}
#' Numeric Assignment of Binding Regions.
#'
#' Numeric S3 generic case for assigning binding regions.
#'
#' @param template.df Template data frame.
#' @param fw Binding region data forward primers.
#' @param rev Binding region data for reverse primers.
#' @param optimize.region Should the primer binding region be optimized using secondary structure prediction?
#' @param primer.length Probe length for optimizing template secondary structure.
#' @return The template data frame with assigned binding regions.
#' @export
#' @keywords internal
assign_binding_regions.numeric <- function(template.df, fw, rev, optimize.region = FALSE,
primer.length = 20, gap.char = "-") {
if (missing(rev)) {
rev <- NULL
}
if (length(fw) > 2 || length(rev) > 2 ||
length(fw) == 1 || length(rev) == 1) {
stop("fw or rev were no intervals.")
}
template.df$Sequence <- ungap_sequence(template.df$InputSequence, gap.char)
l.seq <- create.uniform.leaders(fw, rev, template.df, gap.char)
template.df <- add.uniform.leaders.to.seqs(template.df, l.seq)
# optimization of regions:
if (optimize.region) {
if (length(fw) != 0 && length(rev) != 0) {
mode.directionality <- "both"
} else if (length(fw) != 0) {
mode.directionality <- "fw"
} else {
mode.directionality <- "rev"
}
opti.regions <- optimize.template.binding.regions.dir(template.df, NULL, primer.length, mode.directionality)
opti.intervals <- opti.regions$Intervals
template.df <- update.binding.regions(template.df, opti.intervals)
}
return(template.df)
}
#' Character Assignment of Binding Regions
#'
#' Generic method for assigning the binding region using individual binding regions.
#'
#' @param template.df Template data frame.
#' @param fw FASTA file specifying the forward binding regions.
#' @param rev FASTA file specifying the reverse binding regions.
#' @param optimize.region Should the primer binding region be optimized using secondary structure prediction?
#' @param primer.length Probe length for optimizing template secondary structure.
#' @param gap.char The gap character for aligned sequences.
#' @return Template data frame with assigned binding regions.
#' @export
#' @keywords internal
assign_binding_regions.character <- function(template.df, fw, rev, optimize.region = FALSE,
primer.length = 20, gap.char = "-") {
fw.df <- read.leaders(fw, "fw", gap.character = gap.char)
rev.df <- read.leaders(rev, "rev", gap.character = gap.char)
uni.leaders <- unify.leaders(fw.df, rev.df, template.df, gap.char)
template.df <- get.leader.exon.regions(template.df, uni.leaders)
# optimization of regions:
if (optimize.region) {
if (length(fw) != 0 && length(rev) != 0) {
mode.directionality <- "both"
} else if (length(fw) != 0) {
mode.directionality <- "fw"
} else {
mode.directionality <- "rev"
}
opti.regions <- optimize.template.binding.regions.dir(template.df, NULL, primer.length, mode.directionality)
opti.intervals <- opti.regions$Intervals
template.df <- update.binding.regions(template.df, opti.intervals)
}
return(template.df)
}
#' Update of Binding Regions.
#'
#' Updates the binding regions in the templates by providing
#' new intervals for forward and reverse binding regions.
#'
#' @param template.df An object of class \code{Templates}.
#' @param opti.regions List with new binding intervals.
#' The list can contain the components \code{fw} and \code{rev} providing
#' numeric vectors of length 2 providing the start and end
#' of the binding regions in the templates, for forward
#' and reverse binding regions, respectively.
#' @return A \code{Templates} object with updated binding regions.
#' @keywords internal
update.binding.regions <- function(template.df, opti.regions) {
# updates binding regions
if ("fw" %in% names(opti.regions)) {
if (length(opti.regions[["fw"]]) != 0) {
starts <- sapply(opti.regions[["fw"]], function(x) x[1])
sel <- !is.na(starts) # only modify those where we could find a possible region ...
template.df$Allowed_Start_fw[sel] <- starts[sel]
template.df$Allowed_End_fw[sel] <- sapply(opti.regions[["fw"]], function(x) x[2])[sel]
template.df$Allowed_fw[sel] <- substring(template.df$Sequence[sel], template.df$Allowed_Start_fw[sel],
template.df$Allowed_End_fw[sel])
}
}
if ("rev" %in% names(opti.regions)) {
if (length(opti.regions[["rev"]]) != 0) {
starts <- sapply(opti.regions[["rev"]], function(x) x[1])
sel <- !is.na(starts)
template.df$Allowed_Start_rev[sel] <- starts[sel]
template.df$Allowed_End_rev[sel] <- sapply(opti.regions[["rev"]], function(x) x[2])[sel]
template.df$Allowed_rev[sel] <- substring(template.df$Sequence[sel], template.df$Allowed_Start_rev[sel],
template.df$Allowed_End_rev[sel])
}
}
return(template.df)
}
#' Ungapping of Sequences.
#'
#' Removes gaps from the input sequences.
#'
#' @param seqs The input character vector with sequences
#' @param gap.char The character used to represent gaps.
#' @return \code{seqs} with gaps removed.
#' @keywords internal
ungap_sequence <- function(seqs, gap.char = "-") {
seqs <- gsub(gap.char, "", seqs)
return(seqs)
}
binding.interval.ungapped <- function(template.df, start, end, gap.char) {
# adjust the binding positions with gaps to those without gaps
# gap count from start of seq until the end of the allowed region
gap.count.before <- sapply(strsplit(substring(template.df$InputSequence, 1, start), split = ""),
function(x) length(which(x == gap.char)))
gap.count.within <- sapply(strsplit(substring(template.df$InputSequence, start, end), split = ""),
function(x) length(which(x == gap.char)))
start <- start - gap.count.before
end <- end - gap.count.before - gap.count.within
return(list(start, end))
}
#' Retrieval of Binding Regions
#'
#' Retrives information about individual binding regions.
#'
#' @param template.df Template data frame.
#' @param direction Identify forward and reverse.
#' @param start Start positions.
#' @param end End positions.
#' @param gap.char The character for gaps in alignments.
#' @param init.from.leader Whether the binding regions are initialized
#' from leader sequences.
#' @return Data frame with information on allowed binding regions.
#' @keywords internal
retrieve.leader.region <- function(template.df, direction = c("fw", "rev"), start, end, gap.char,
init.from.leader) {
if (length(start) == 0 || length(end) == 0) {
return(NULL)
}
if (length(direction) == 0) {
stop("Please provide the 'direction' argument.")
}
direction <- match.arg(direction)
if (length(start) == 1 && length(end) == 1) {
# start and end refer to intervals (uniform range)
start <- rep(start, nrow(template.df))
end <- rep(end, nrow(template.df))
} # otherwise: regions are already ok in the input.
if (direction == "rev" && !init.from.leader) {
# only for uniform binding regions: need to change the interpretation of rev positions
ori.start <- start
start <- nchar(template.df$InputSequence) - end + 1
end <- nchar(template.df$InputSequence) - ori.start + 1
}
# store start/end in possibly gappy input
ali.start <- start
ali.end <- end
# gap count from start of seq until the end of the allowed region
# adjust binding region in the ungapped sequence according to the number of gaps in between
new.interval <- binding.interval.ungapped(template.df, start, end, gap.char)
# work with start/end in ungapped seqs
start <- new.interval[[1]]
end <- new.interval[[2]]
# use the ungapped sequences now to work on the correct positions
leader <- substring(template.df$Sequence, start, end)
# deal with incorrect positions
idx <- which(start < 0 & end < 0) # interval is not in range -> no leader available
if (length(idx) != 0) {
start[idx] <- NA
end[idx] <- NA
}
idx <- which(end < 0) # only end is too small
if (length(idx) != 0) {
end[idx] <- 0 # no leader available
}
idx <- which(start < 0) # only start is too early
if (length(idx) != 0) {
start[idx] <- 1
}
idx <- which(unlist(lapply(seq_along(end), function(x) end[x] > (nchar(template.df$Sequence[x]) +
1) && start[x] > (nchar(template.df$Sequence[x]) + 1))))
if (length(idx) != 0) {
# not in the defined range
end[idx] <- NA
start[idx] <- NA
}
idx <- which(unlist(lapply(seq_along(start), function(x) start[x] > (1 + nchar(template.df$Sequence[x]))))) #only start is larger than the range
if (length(idx) != 0) {
start[idx] <- 0
end[idx] <- 0
}
idx <- which(unlist(lapply(seq_along(end), function(x) end[x] > (1 + nchar(template.df$Sequence[x]))))) #only end is larger than the range
if (length(idx) != 0) {
end[idx] <- nchar(template.df$Sequence)[idx]
}
df <- build_leader_df(direction, leader, start, end, ali.start, ali.end)
return(df)
}
#' Building of Leader Data Frame.
#'
#' Constructs the leader data frame.
#'
#' @param direction The primer direction for which we are annotating binding regions.
#' @param leader The binding region sequence.
#' @param start The start positions of the binding region.
#' @param end The end positions of the binding region.
#' @param ali.start The start positions of the binding region in the aligned input.
#' @param ali.end The end positions of the binding region in the aligned input.
#' @return A data frame with binding region information.
#' @keywords internal
build_leader_df <- function(direction = c("fw", "rev"), leader, start, end,
ali.start, ali.end) {
if (length(direction) == 0) {
stop("Please provide the 'direction' argument.")
}
direction <- match.arg(direction)
# initial: stored such that binding range can be modified later in relation to initial configuration
df <- data.frame(Allowed_fw = character(length(leader)), # binding region string
Allowed_rev = character(length(leader)),
# binding region intervals:
Allowed_Start_fw = numeric(length(start)),
Allowed_Start_rev = numeric(length(start)),
Allowed_End_fw = numeric(length(end)),
Allowed_End_rev = numeric(length(end)),
# the binding region in gapped sequence positions from alignemnts
Allowed_Start_fw_ali = numeric(length(start)),
Allowed_Start_rev_ali = numeric(length(start)),
Allowed_End_fw_ali = numeric(length(end)),
Allowed_End_rev_ali = numeric(length(end)),
# initial: the initially specified binding region (without gaps)
Allowed_Start_fw_initial = numeric(length(start)),
Allowed_Start_rev_initial = numeric(length(start)),
Allowed_End_fw_initial = numeric(length(end)),
Allowed_End_rev_initial = numeric(length(end)),
# initial positions in aligned input:
Allowed_Start_fw_initial_ali = numeric(length(start)),
Allowed_Start_rev_initial_ali = numeric(length(start)),
Allowed_End_fw_initial_ali = numeric(length(end)),
Allowed_End_rev_initial_ali = numeric(length(end)),
stringsAsFactors = FALSE)
if (direction == "rev") {
df$Allowed_rev <- leader
df$Allowed_Start_rev <- start
df$Allowed_End_rev <- end
df$Allowed_Start_rev_ali <- ali.start
df$Allowed_End_rev_ali <- ali.end
df$Allowed_Start_rev_initial <- df$Allowed_Start_rev
df$Allowed_End_rev_initial <- df$Allowed_End_rev
df$Allowed_Start_rev_initial_ali <- df$Allowed_Start_rev_ali
df$Allowed_End_rev_initial_ali <- df$Allowed_End_rev_ali
} else {
df$Allowed_fw <- leader
df$Allowed_Start_fw <- start
df$Allowed_End_fw <- end
df$Allowed_Start_fw_ali <- ali.start
df$Allowed_End_fw_ali <- ali.end
df$Allowed_Start_fw_initial <- df$Allowed_Start_fw
df$Allowed_End_fw_initial <- df$Allowed_End_fw
df$Allowed_Start_fw_initial_ali <- df$Allowed_Start_fw_ali
df$Allowed_End_fw_initial_ali <- df$Allowed_End_fw_ali
}
# we return only the modified binding direction such that
# previous settings in template.df are not overwritten:
sel.cols <- grepl(direction, colnames(df))
df <- df[,sel.cols]
return(df)
}
#' Uniform Binding Ranges.
#'
#' Creates uniform binding regions for all templates.
#'
#' @param fw.interval Binding region for forward templates.
#' @param rev.interval Binding region for reverse templates.
#' @param template.df Template data frame.
#' @param gap.char The character for gaps in alignments.
#' @return Data frame with binding region information.
#' @keywords internal
create.uniform.leaders <- function(fw.interval, rev.interval, template.df, gap.char) {
if (length(fw.interval) == 0 && length(rev.interval) == 0) {
warning("No fw/rev intervals were specified for specifying a uniform binding range.")
}
if (length(fw.interval) == 2) {
if (fw.interval[1] > fw.interval[2]) {
stop("Forward binding region was no proper interval.")
}
} else if (length(fw.interval) > 0) {
stop("Forward binding region was no proper interval.")
}
if (length(rev.interval) == 2) {
if (rev.interval[1] > rev.interval[2]) {
stop("Reverse binding region was no proper interval.")
}
} else if (length(rev.interval) > 1) {
stop("Reverse binding region was no proper interval.")
}
fw.leaders <- retrieve.leader.region(template.df, "fw", fw.interval[1], fw.interval[2], gap.char, FALSE)
rev.leaders <- retrieve.leader.region(template.df, "rev", rev.interval[1], rev.interval[2], gap.char, FALSE)
if (length(fw.leaders) == 0) {
result <- rev.leaders
} else if (length(rev.leaders) == 0) {
result <- fw.leaders
} else {
result <- cbind(fw.leaders[,grep("_fw", colnames(fw.leaders))],
rev.leaders[, grep("_rev", colnames(rev.leaders))])
}
col.order <- c("Allowed_Start_fw", "Allowed_End_fw", "Allowed_Start_rev", "Allowed_End_rev",
"Allowed_fw", "Allowed_rev")
all.cols <- c(col.order[col.order %in% colnames(result)], setdiff(colnames(result),
col.order))
result <- result[, all.cols]
return(result)
}
#' Add Uniform Binding Regions.
#'
#' Augments a template data frame with uniform binding regions.
#'
#' @param lex.seq Template data frame
#' @param leaders Data frame with uniform binding regions.
#' @return Template data frame with updated binding regions.
#' @keywords internal
add.uniform.leaders.to.seqs <- function(lex.seq, leaders) {
result <- lex.seq
if ("TemplateIdentifier" %in% colnames(leaders)) {
# need to map from leader annotation to templates
m <- match(leaders$TemplateIdentifier, lex.seq$Identifier)
} else {
# leader annotation is not a subset of the templates
m <- seq_len(nrow(leaders))
}
for (i in colnames(leaders)) {
result[m, i] <- leaders[, i]
}
return(result)
}
#' Read Individual Binding Regions
#'
#' Reads individual binding regions into a data frame.
#'
#' @param fasta.file Path to a FASTA file with binding regions.
#' @param direction String identifying whether the FASTA file
#' contains information pertaining to the binding region of forward
#' or reverse primers.
#' @param rm.keywords A vector of keywords that are used to remove templates whose headers contain any of the keywords.
#' @param gap.character The character for indicating gaps in sequences.
#' @return A data frame with individual binding regions.
#' @keywords internal
read.leaders <- function(fasta.file, direction = c("fw", "rev"), rm.keywords = NULL, gap.character) {
if (length(fasta.file) == 0) {
return(NULL)
}
d <- read.sequences(fasta.file, gap.character)
if (direction == "fw") {
colnames(d)[which(colnames(d) == "Sequence")] <- "Allowed_fw"
} else {
colnames(d)[which(colnames(d) == "Sequence")] <- "Allowed_rev"
}
if (length(rm.keywords) != 0) {
d <- remove.seqs.by.keyword(d, rm.keywords)
}
return(d)
}
#' Parser for IMGT Groups.
#'
#' Parses IMGT group information contained in FASTA headers.
#'
#' @param IDs Group information strings to be parsed.
#' @return Data frame with structured group information.
#' @keywords internal
parse.IMGT.gene.groups <- function(IDs) {
# regexp: (gene)-(subgroup)-(localization in the locus)-(extracted_labels)
exp <- "([^-]+)-(.+)(\\*[^\\|]+)"
# some genes are ORs (orphons) -> still the same gene group? orphons appear
# outside the main genetic loci but are still assigned to groups .. clan
# annotations: clan1: ighv1, ighv5, ighv7
idx <- stringr::str_match(IDs, exp)
if (any(apply(idx, 1, is.na))) {
# at least one entry not in IMGT format
return(NULL)
}
# remove OR annotation from GeneGroup
colnames(idx) <- c("Full", "Gene", "Subgroup", "Allele")
s <- strsplit(idx[, "Gene"], split = "/")
g <- sapply(s, function(x) x[1])
idx[, "Gene"] <- g
# remove localization from Subgroup
s <- strsplit(idx[, "Subgroup"], "Subgroup", split = "-")
g <- sapply(s, function(x) x[1])
idx[, "Subgroup"] <- g
identifier <- paste(idx[, "Gene"], idx[, "Subgroup"], sep = "-")
result <- data.frame(Group_Identifier = identifier, Gene_Group = idx[, "Gene"],
Sub_Group = idx[, "Subgroup"], Allele = idx[, "Allele"],
stringsAsFactors = FALSE)
# message(result)
return(result)
}
#' Unification of Leaders
#'
#' Unifies individual binding regions for forward and reverse primers.
#'
#' @param l.seq.fw Data frame with binding information for forward primers.
#' @param l.seq.rev Data frame with binding information for reverse primers.
#' @param lex.seq Template data frame.
#' @param gap.char The character for indicating alignment gaps.
#' @return Template data frame with annotated binding regions.
#' @keywords internal
unify.leaders <- function(l.seq.fw, l.seq.rev, lex.seq, gap.char) {
fw.df <- NULL
rev.df <- NULL
if (length(l.seq.fw) == 0 && length(l.seq.rev) == 0) {
return(lex.seq)
}
template.df <- lex.seq
if (length(l.seq.fw) != 0) {
fw.df <- get.leader.exon.regions.single(l.seq.fw, lex.seq, "fw", gap.char)
template.df <- add.uniform.leaders.to.seqs(template.df, fw.df)
}
if (length(l.seq.rev) != 0) {
rev.df <- get.leader.exon.regions.single(l.seq.rev, lex.seq, "rev", gap.char)
template.df <- add.uniform.leaders.to.seqs(template.df, rev.df)
}
template.df$Allowed_Start_fw_initial <- template.df$Allowed_Start_fw
template.df$Allowed_End_fw_initial <- template.df$Allowed_End_fw
template.df$Allowed_End_rev_initial <- template.df$Allowed_End_rev
template.df$Allowed_Start_rev_initial <- template.df$Allowed_Start_rev
template.df$Sequence <- ungap_sequence(template.df$InputSequence, gap.char) # use the ungapped seq from now on only
return(template.df)
}
#' Assign Binding Regions
#'
#' Augments a template data frame with individual binding regions.
#'
#' @param lex.seqs Data frame with template sequences.
#' @param uni.leaders Data frame with individual allowed binding regions.
#' @return Template data frame with annotated binding regions.
#' @keywords internal
get.leader.exon.regions <- function(lex.seqs, uni.leaders) {
if (length(uni.leaders) == 0) {
return(NULL)
}
# add missing columns
if (!"Allowed_Start_fw" %in% colnames(uni.leaders)) {
uni.leaders$Allowed_Start_fw <- lex.seqs$Allowed_Start_fw
uni.leaders$Allowed_End_fw <- lex.seqs$Allowed_End_fw
uni.leaders$Allowed_fw <- ""
uni.leaders$Allowed_Start_fw_initial <- uni.leaders$Allowed_Start_fw
uni.leaders$Allowed_End_fw_initial <- uni.leaders$Allowed_End_fw
}
if (!"Allowed_Start_rev" %in% colnames(uni.leaders)) {
uni.leaders$Allowed_Start_rev <- lex.seqs$Allowed_Start_rev
uni.leaders$Allowed_End_rev <- lex.seqs$Allowed_End_rev
uni.leaders$Allowed_rev <- ""
uni.leaders$Allowed_Start_rev_initial <- uni.leaders$Allowed_Start_rev
uni.leaders$Allowed_End_rev_initial <- uni.leaders$Allowed_End_rev
}
# add entries from lex.seqs, where uni.leaders didn't have anything
fw.idx <- which(uni.leaders$Allowed_fw == "")
rev.idx <- which(uni.leaders$Allowed_rev == "")
if (length(fw.idx) != 0) {
uni.leaders$Allowed_Start_fw[fw.idx] <- lex.seqs$Allowed_Start_fw[fw.idx]
uni.leaders$Allowed_End_fw[fw.idx] <- lex.seqs$Allowed_End_fw[fw.idx]
uni.leaders$Allowed_Start_fw_initial[fw.idx] <- lex.seqs$Allowed_Start_fw[fw.idx]
uni.leaders$Allowed_End_fw_initial[fw.idx] <- lex.seqs$Allowed_End_fw[fw.idx]
}
if (length(rev.idx) != 0) {
uni.leaders$Allowed_Start_rev[rev.idx] <- lex.seqs$Allowed_Start_rev[rev.idx]
uni.leaders$Allowed_End_rev[rev.idx] <- lex.seqs$Allowed_End_rev[rev.idx]
uni.leaders$Allowed_Start_rev_initial[rev.idx] <- lex.seqs$Allowed_Start_rev[rev.idx]
uni.leaders$Allowed_End_rev_initial[rev.idx] <- lex.seqs$Allowed_End_rev[rev.idx]
}
# remove duplicate cols
dup.idx <- which(duplicated(colnames(uni.leaders)))
if (length(dup.idx) != 0) {
uni.leaders <- uni.leaders[, -dup.idx]
}
return(uni.leaders)
}
#' Individual Binding Annotation
#'
#' Annotate individual binding regions.
#'
#' @param l.seq Data frame with individual binding regions.
#' @param lex.seq Template data frame.
#' @param direction The primer direction for which the binding info is valid.
#' @param gap.char The character for gaps in alignments.
#' @return Template data frame with annotated binding regions.
#' @keywords internal
get.leader.exon.regions.single <- function(l.seq, lex.seq,
direction = c("fw", "rev"), gap.char) {
if (length(direction) == 0) {
stop("Please provide the 'direction' argument.")
}
direction <- match.arg(direction)
# match leader sequences to lex seqs
# try to match accurately:
idx <- lapply(lex.seq$Header, function(x) match(x, l.seq$Header)) # hit from seqs to leaders
any.missing <- any(unlist(lapply(idx, function(x) is.na(x))))
if (any.missing) {
# could not match template header to leader headers, try approximate matching
# NB: substring here is a necessary hack for mapping IMGT leaders to seqs
idx <- lapply(lex.seq$Header, function(x) grep(substring(x, 1, 30), l.seq$Header, fixed = TRUE))
}
# ensure that each template has a unique leader
hit.len <- sapply(idx, length)
nbr.of.hits <- sum(hit.len)
if (any(hit.len == 0 | hit.len > 1)) {
if (any(hit.len == 0)) {
error.msg <- "Some templates did not have a corresponding binding region. The binding regions of templates for which no allowed region was specified were not changed."
my.idx <- which(hit.len == 0)
error.msg <- paste(error.msg, "The IDs of the sequences are:\n", paste(lex.seq$ID[my.idx],
collapse = "\n", sep = ""))
my.warning("MissingLeaders", error.msg)
} else {
error.msg <- "Some templates had more than one corresponding binding region. Only the first specified binding region was considered in these cases."
my.idx <- which(hit.len > 1)
error.msg <- paste(error.msg, "The IDs of the sequences are:\n", paste(lex.seq$ID[my.idx],
collapse = "\n", sep = ""))
my.warning("RedundantLeaders", error.msg)
}
} else if (nbr.of.hits != nrow(l.seq)) {
# not all leaders were matched to a template sequence
used.leaders <- unique(unlist(idx))
not.used.leaders <- setdiff(seq_len(nrow(l.seq)), used.leaders)
leader.info <- paste("Leaders with the following headers were not used: ",
paste(l.seq$Header[not.used.leaders], collapse = ", "), sep = "")
my.warning("Not_all_leaders_matched", "Some allowed regions did not match any template header and were ignored.")
}
# select indices that are available if multiple leader regions are given for one
# sample, take the 1st one arbtirarily
sel.idx.zero <- which(sapply(idx, length) == 0)
idx <- sapply(seq_along(idx), function(x) if (x %in% sel.idx.zero) {
NA
} else {
idx[[x]][1]
})
sel.idx.lex <- which(!is.na(idx)) # ids of the templates where a leader seq was found
idx <- unlist(idx)
if (length(sel.idx.lex) == 0) {
my.error("Leaders_no_matches", "No matches between templates and allowed binding regions. No binding regions were annotated")
}
sel.idx <- idx[sel.idx.lex] # ids of the leader.seqs with a template
######
template.df <- lex.seq[sel.idx.lex, ]
idx.add <- which(!(colnames(l.seq) %in% colnames(template.df)))
leader.col <- paste("Allowed", "_", direction, sep = "")
template.df[, leader.col] <- l.seq$Allowed[sel.idx] # replace the leader
# rename the other columns that are already present and add
cols <- setdiff(colnames(l.seq), leader.col)
idx.replace <- which(cols %in% colnames(template.df))
if (length(idx.replace) != 0) {
template.df[, cols[idx.replace]] <- l.seq[sel.idx, cols[idx.replace]]
}
# add the new columns also ..
if (length(idx.add) != 0) {
template.df[, colnames(l.seq)[idx.add]] <- l.seq[sel.idx, idx.add]
}
# find leader position in sequences with exact matching if we have a forward
# restriction, select the first match we find, otherwise select the last match we
# find
leader.pos <- lapply(seq_len(nrow(template.df)), function(x) gregexpr(template.df[x, leader.col],
template.df$InputSequence[x]))
leader.idx <- NULL
if (direction == "fw") {
# always take the first match we find
leader.idx <- rep(1, length(leader.pos))
} else {
# always take the last match we find
leader.idx <- unlist(lapply(seq_along(leader.pos), function(x) length(unlist(leader.pos[[x]]))))
}
leader.s <- sapply(seq_along(leader.pos), function(x) as.numeric(unlist(leader.pos[[x]])[leader.idx[x]]))
leader.e <- sapply(seq_along(leader.pos), function(x) leader.s[x] + attr(leader.pos[[x]][[1]],
"match.length")[leader.idx[x]] - 1)
# verify that all leader.s and leader.e are defined
leader.s.ok <- leader.s > 0
leader.e.ok <- leader.e > 0
if (any(!leader.s.ok) || any(!leader.e.ok)) {
warn.msg <- "Some of the input allowed binding sites could not be found in the templates. The affected sequences are:\n"
idx1 <- which(!leader.s.ok)
idx2 <- which(!leader.e.ok)
idx <- union(idx1, idx2)
warn.msg <- paste(warn.msg, paste(template.df$ID[idx], collapse = "\n", sep = ""))
my.warning("LeadersNotFound", warn.msg)
}
final.df <- retrieve.leader.region(template.df, direction, leader.s, leader.e, gap.char, TRUE) # retrieve leader region: overwrites values of other directions
final.df <- cbind(final.df, "TemplateIdentifier" = template.df$Identifier)
if (FALSE) {
if (any(leader.s.ok)) {
# set allowed region start/end
allowed.col.e <- paste("Allowed_End", "_", direction, sep = "")
allowed.col.s <- paste("Allowed_Start", "_", direction, sep = "")
template.df[leader.s.ok, allowed.col.s] <- leader.s[leader.s.ok]
template.df[leader.s.ok, allowed.col.e] <- leader.e[leader.s.ok]
}
}
return(final.df)
}
#' @rdname Output
#' @name Output
#' @aliases write_templates
#' @return \code{write_templates} stores templates to \code{fname}.
#' @export
#' @examples
#'
#' data(Ippolito)
#' # Store templates as FASTA
#' fname.fasta <- tempfile("my_templates", fileext = ".fasta")
#' write_templates(template.df, fname.fasta)
#' # Store templates as CSV
#' fname.csv <- tempfile("my_templates", fileext = ".csv")
#' write_templates(template.df, fname.csv, "CSV")
write_templates <- function(template.df, fname, ftype = c("FASTA", "CSV")) {
ftype <- match.arg(ftype)
if (ftype == "FASTA") {
seqs <- template.df$Sequence
labels <- template.df$ID
seqinr::write.fasta(as.list(seqs), as.list(labels), fname)
} else if (ftype == "CSV") {
write.csv(template.df, file = fname, row.names = FALSE)
} else {
warning("Unknown filetype: ", ftype)
}
}
#' @rdname Scoring
#' @name Scoring
#' @aliases score_conservation
#' @return A list containing \code{Entropies} and \code{Alignments}.
#' \code{Entropies} is a data frame with conservation scores.
#' Each column indicates a position in the alignment of template sequences
#' and each row gives the entropies of the sequences
#' belonging to a specific group of template sequences.
#' \code{Alignments} is a list of \code{DNABin} objects, where each
#' object gives the alignment corresponding to one group of template sequences.
#' @export
#' @note \code{score_conservation} requires the MAFFT software
#' for multiple alignments (http://mafft.cbrc.jp/alignment/software/).
#' @examples
#' \dontrun{
#' data(Ippolito)
#' entropy.data <- score_conservation(template.df, gap.char = "-", win.len = 18, by.group = TRUE)
#' }
score_conservation <- function(template.df, gap.char = "-",
win.len = 30, by.group = TRUE) {
# only align sequences if not aligned already
if (!check.tool.function()["MAFFT"]) {
stop("MAFFT is required for scoring the conservation (http://mafft.cbrc.jp/alignment/software/).")
}
if (!any(grepl(gap.char, template.df$InputSequence))) {
# no gap char found -> sequences are not aligned yet
message("Aligning sequences ...")
ali <- align.seqs(template.df$InputSequence, template.df$ID)
} else {
message("Assuming that the sequences are pre-aligned.")
ali <- seqinr::as.alignment(nb = length(template.df$InputSequence),
nam = template.df$ID,
seq = template.df$InputSequence)
}
if (is.null(ali)) {
# alignment wasn't possible
return(NULL)
}
# sanitize the sequences: (carriage returns cause problems in windows)
ali$seq <- gsub("\r", "", ali$seq)
bin <- ape::as.DNAbin(ali)
# create groups in the alignment according to user-anntotation
# may automatic grouping in the future (clustering?)
if (by.group) {
groups <- unique(template.df$Group)
bins <- lapply(seq_along(groups), function(x) bin[which(template.df$Group == groups[x]),])
} else {
groups <- "default"
bins <- list(bin)
}
# determine conservation according to Shannon entropy for each alignment group
ali.entropy <- lapply(bins, function(x) shannon.entropy(as.character(x)))
# create a data frame representation
entropy.df <- do.call(rbind, ali.entropy)
rownames(entropy.df) <- groups
colnames(entropy.df) <- seq_len(ncol(entropy.df))
names(bins) <- groups
out <- list("Alignments" = bins, "Entropies" = entropy.df)
return(out)
}
#' Identification of Gappy Columns in Alignments.
#' @param bins A list of \code{DNABin} alignments.
#' @param gap.cutoff The required percentage of gaps
#' for consideration as a gap column.
#' @param gap.char The gap character in the alignments.
#' @return A list with indices giving the gap columns
#' for every alignment in \code{bins}.
#' @keywords internal
detect.gap.columns <- function(bins, gap.cutoff = 0.95, gap.char = "-") {
gap.cutoff <- 0.95 # if there is a column 95% gaps, ignore the whole region
gap.freq <- lapply(bins, function(x) {
mat <- as.character(x)
count <- unlist(lapply(seq_len(ncol(mat)), function(y)
length(which(mat[,y] == gap.char))))
freq <- count / nrow(mat)
})
# exclude regions with gap percentage above cutoff -> useless to construct primers here
mask.idx <- lapply(gap.freq, function(x) which(x >= gap.cutoff))
return(mask.idx)
}
#' @rdname Plots
#' @name Plots
#' @aliases plot_conservation
#' @return \code{plot_conseration} returns a plot showing the degree of sequence conservation in the templates.
#' @export
#' @note Computing the conservation scores for using \code{plot_conservation} requires
#' the MAFFT software for multiple alignments (http://mafft.cbrc.jp/alignment/software/).
#' @examples
#'
#' data(Ippolito)
#' # Select binding regions for every group of templates and plot:
#' template.df <- select_regions_by_conservation(template.df, win.len = 30)
#' if (length(template.df) != 0) {
#' p1 <- plot_conservation(attr(template.df, "entropies"),
#' attr(template.df, "alignments"), template.df)
#' }
#' # Select binding regions for all templates and plot:
#' data(Ippolito)
#' template.df <- select_regions_by_conservation(template.df, by.group = FALSE)
#' if (length(template.df) != 0) {
#' p2 <- plot_conservation(attr(template.df, "entropies"),
#' attr(template.df, "alignments"), template.df)
#' }
plot_conservation <- function(entropy.df, alignments, template.df, gap.char = "-") {
# set gappy columns to 0 conservation for all groups
gap.idx <- detect.gap.columns(alignments, gap.cutoff = 0.95, gap.char = gap.char)
m <- match(rownames(entropy.df), names(gap.idx))
for (i in seq_len(nrow(entropy.df))) { # for each alignment group
entropy.df[i, gap.idx[[m[i]]]] <- 1 # max entropy
}
plot.df <- melt(entropy.df, value.name = "Entropy")
# since we have [0,1] interval entropies, we can simply convert to conservation:
colnames(plot.df) <- c("Group", "Position", "Entropy")
plot.df$Conservation <- 1 - plot.df$Entropy
# get region data for every group
get.region.df <- function(allowed.start.fw.old, allowed.end.fw.old,
allowed.start.fw.new, allowed.end.fw.new,
allowed.start.rev.old, allowed.end.rev.old,
allowed.start.rev.new, allowed.end.rev.new) {
# select unique interval here, but how?
fw.ori.range <- unique(cbind(allowed.start.fw.old,
allowed.end.fw.old))
fw.ori.df <- data.frame("Start" = fw.ori.range[,1],
"End" = fw.ori.range[,2],
"Direction" = "fw",
"Type" = "Old")
fw.new.range <- unique(cbind(allowed.start.fw.new,
allowed.end.fw.new))
fw.new.df <- data.frame("Start" = fw.new.range[,1],
"End" = fw.new.range[,2],
"Direction" = "fw",
"Type" = "New")
rev.ori.range <- unique(cbind(allowed.start.rev.old,
allowed.end.rev.old))
rev.ori.df <- data.frame("Start" = rev.ori.range[,1],
"End" = rev.ori.range[,2],
"Direction" = "rev",
"Type" = "Old")
rev.new.range <- unique(cbind(allowed.start.rev.new,
allowed.end.rev.new))
rev.new.df <- data.frame("Start" = rev.new.range[,1],
"End" = rev.new.range[,2],
"Direction" = "rev",
"Type" = "New")
range.df <- rbind(fw.ori.df, fw.new.df,
rev.ori.df, rev.new.df)
return(range.df)
}
groups <- names(alignments)
allowed.df <- vector("list", length(groups))
for (i in seq_along(alignments)) {
if (groups[i] == "default") {
# capture all templates
sub.df <- template.df
} else {
# select individual groups
sub.df <- template.df[template.df$Group == groups[i],]
}
region.df <- get.region.df(sub.df$Allowed_Start_fw_initial_ali,
sub.df$Allowed_End_fw_initial_ali,
sub.df$Allowed_Start_fw_ali,
sub.df$Allowed_End_fw_ali,
sub.df$Allowed_Start_rev_initial_ali,
sub.df$Allowed_End_rev_initial_ali,
sub.df$Allowed_Start_rev_ali,
sub.df$Allowed_End_rev_ali)
region.df$Group <- groups[i]
allowed.df[[i]] <- region.df
}
allowed.df <- do.call(rbind, allowed.df)
group.colors <- brewer.pal(8, "Set1")
group.colors <- colorRampPalette(group.colors)(length(groups))
group.colors <- c(group.colors, "grey40")
# only plot the new allowed regions
allowed.new <- allowed.df[allowed.df$Type == "New",]
allowed.old <- allowed.df[allowed.df$Type == "Old",]
# get max bar height for any group for a position
#max.conservation <- ddply(plot.df, c("Position"), summarize, "TotalConservation" = sum(substitute(Conservation)))
#box.extent <- max(max.conservation$TotalConservation) * 0.1
box.extent <- max(plot.df$Conservation) * 0.1
box.ymax <- -0.05 * max(plot.df$Conservation)
box.ymin <- box.ymax - box.extent
# select only unique old regions
allowed.old <- allowed.old[row.names(unique(allowed.old[, c("Start", "End")])), ]
p <- ggplot() +
geom_bar(data = plot.df, aes_string(x = "Position",
y = "Conservation",
fill = "Conservation"),
stat = "identity",
position = "stack") +
xlab("Position") +
ylab("Conservation") +
scale_y_continuous(labels = scales::percent) +
facet_wrap(~Group, ncol = 2)
# add rectangles for binding region annotation
p <- p + geom_rect(data = allowed.new,
aes_string(xmin = "Start",
xmax = "End",
ymax = box.ymax,
ymin = box.ymin),
#color = "Type",
alpha = 1) +
# don't plot old regions: may not be updated to aligned binding regions
#geom_rect(data = allowed.old,
#aes_string(xmin = "Start",
#xmax = "End",
#fill = "Type",
#ymax = box.ymax,
#ymin = box.ymin),
#alpha = 0.2) +
#scale_fill_manual(values = group.colors) +
scale_fill_gradient()
ggtitle("Sequence conservation")
return(p)
}
#' Updates Binding Region in the Alignment by conservation.
#' @param template.df A \code{Templates} object.
#' @param bins A list with \code{DNAbin} alignments, one for each group of template sequences.
#' @param entropy.df A data frame with entropy information.
#' @param gap.char The gap character for alignments.
#' @param win.len The desired length of the new binding region.
#' @param direction The direction for which the binding range shall be adjusted.
#' @return A \code{Templates} object with modified binding regions.
#' @keywords internal
update.binding.ranges.by.conservation <- function(template.df,
bins,
entropy.df,
gap.char = "-",
win.len = 30, direction = c("fw", "rev")) {
if (length(direction) == 0) {
stop("Please provide the 'direction' argument.")
}
direction <- match.arg(direction)
if (direction == "fw") {
allowed.region <- cbind(template.df$Allowed_Start_fw_initial_ali,
template.df$Allowed_End_fw_initial_ali)
} else {
allowed.region <- cbind(template.df$Allowed_Start_rev_initial_ali,
template.df$Allowed_End_rev_initial_ali)
}
step.size <- 1 # shift each window by 5 positions for speedup of computations
primer.range <- create.primer.ranges(allowed.region[,2], rep(win.len, nrow(allowed.region)),
start.position = allowed.region[,1], step.size = step.size,
groups = template.df$Group)
primer.ranges <- parallel::mclapply(seq_along(bins),
function(x) select.primer.region.by.conservation(primer.range[primer.range$Group == names(bins)[x], ],
entropy.df[x,], 0.1, bins[[x]], gap.char) # select top 10% within the allowed region
)
# annotate ranges with groups
for (i in seq_along(primer.ranges)) {
primer.ranges[[i]] <- cbind(Group = names(bins)[i], primer.ranges[[i]]) # there was an error here ..
}
ranges <- do.call(rbind, primer.ranges)
# select range with smallest entropy for every group
selected.ranges <- ddply(ranges, "Group", function(x) arrange(x, substitute(Entropy))[1,])
# assign new binding regions for every group of template sequences individually
new.templates <- template.df
for (i in seq_along(selected.ranges$Group)) {
group <- selected.ranges$Group[i]
ali <- as.character(bins[[i]])
ali.seqs <- unlist(lapply(seq_len(nrow(ali)),function(x) paste0(ali[x,], collapse = "")))
# identify templates belonging to this group
idx <- which(template.df$Group == group)
sub.df <- template.df[idx,]
# set the new aligned sequence
sub.df$InputSequence <- ali.seqs
region <- c(selected.ranges[i, "Start"], selected.ranges[i, "End"])
if (direction == "fw") {
region.fw <- region
region.rev <- NULL
} else {
region.fw <- NULL
# convert from absolute posi to relative posi from the end of the seq
region.rev <- rev(ncol(ali) - region)
}
sub.df <- assign_binding_regions(sub.df, region.fw, region.rev)
# store old (initial binding regions)
if (direction == "fw") {
sub.df$Allowed_Start_fw_initial_ali <- template.df[idx, "Allowed_Start_fw_initial_ali"]
sub.df$Allowed_End_fw_initial_ali <- template.df[idx, "Allowed_End_fw_initial_ali"]
} else {
sub.df$Allowed_Start_rev_initial_ali <- template.df[idx, "Allowed_Start_rev_initial_ali"]
sub.df$Allowed_End_rev_initial_ali <- template.df[idx, "Allowed_End_rev_initial_ali"]
}
# overwrite template entry
new.templates[idx,] <- sub.df
}
return(new.templates)
}
#' @rdname TemplatesFunctions
#' @name TemplatesFunctions
#' @aliases select_regions_by_conservation
#' @return \code{select_regions_by_conservation} returns a
#' \code{Templates} object with adjusted binding regions.
#' The attribute \code{entropies} gives a data frame with positional entropies
#' and the attribute \code{alignments} gives the alignments of the templates.
#' @export
#' @note \code{select_regions_by_conservation} requires the MAFFT software
#' for multiple alignments
#' (http://mafft.cbrc.jp/alignment/software/).
#' @examples
#' data(Ippolito)
#' new.template.df <- select_regions_by_conservation(template.df)
select_regions_by_conservation <- function(template.df,
gap.char = "-",
win.len = 30, by.group = TRUE,
mode.directionality = c("both", "fw", "rev")) {
if (!is(template.df, "Templates")) {
stop("Please input an object of class 'Templates'.")
}
if (!check.tool.function()["MAFFT"]) {
warning("MAFFT is required for selecting conserved regions (http://mafft.cbrc.jp/alignment/software/).")
return(NULL)
}
direction <- match.arg(mode.directionality)
# note: time could be saved by not computing the entropy of the full sequence, but only the entropy of the possible binding regions?
# note: individual binding regions are not really respected as selection is via groups and not individual sequences.
entropy.data <- score_conservation(template.df, gap.char, win.len, by.group)
if (!by.group) {
# ignore group annotation for updating binding regions
template.df$Group <- "default"
}
bins <- entropy.data$Alignments # list with DNABin objects
entropy.df <- entropy.data$Entropies # list with data frames giving the entropies
if (direction == "fw" || direction == "rev") {
template.df <- update.binding.ranges.by.conservation(template.df, bins, entropy.df, gap.char, win.len, direction)
} else if (direction == "both") {
t.fw <- update.binding.ranges.by.conservation(template.df, bins, entropy.df, gap.char, win.len, "fw")
template.df <- update.binding.ranges.by.conservation(t.fw, bins, entropy.df, gap.char, win.len, "rev")
}
# add additional data (for plotting) as attributes
attr(template.df, "entropies") <- entropy.df
attr(template.df, "alignments") <- bins
return(template.df)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.