# @title One-hot encode
#
# @description One-hot encode a given DNA sequence.
#
# @param givenSeq A single DNA sequence as character/string.
#
# @return The one-hot encoded sequence.
#
.one_hot_encode_sinuc <- function(givenSeq) {
# Input: A DNA seq as a vector of caharacters (A/C/G/T)
# Returns: A row matrix of
# size 4*seqlen
dna_alphabet <- c("A", "C", "G", "T")
seqlen <- length(givenSeq)
use_colnames <- .get_feat_names(alph=dna_alphabet, k=1, seqlen=seqlen)
if (seqlen > 0) {
one_hot_encoded <- matrix(rep(0, length(dna_alphabet) * seqlen),
nrow = 1,
byrow = TRUE)
# characters to match
one_hot_encoded[, 0 * seqlen + which(givenSeq == dna_alphabet[1])] <-
1
one_hot_encoded[, 1 * seqlen + which(givenSeq == dna_alphabet[2])] <-
1
one_hot_encoded[, 2 * seqlen + which(givenSeq == dna_alphabet[3])] <-
1
one_hot_encoded[, 3 * seqlen + which(givenSeq == dna_alphabet[4])] <-
1
# colnames(one_hot_encoded) <- paste(rep(dna_alphabet,
# each = seqlen), seq_len(seqlen), sep=".")
##
colnames(one_hot_encoded) <- use_colnames
##
return(one_hot_encoded)
} else {
stop("Empty or NULL found")
}
}
## =============================================================================
# @title One-hot encode TRInucleotide profiles
#
# @description One-hot encode the dinucleotide profile of a given DNA sequence.
#
# @param givenSeq A single sequence.
#
# @return The one-hot encoded sequence.
#
.one_hot_encode_trinuc <- function(givenSeq) {
# Input: A DNA seq as a vector of characters (A/C/G/T)
# Returns: A row matrix of
# size 4*seqlen
dna_alphabet <- c("A", "C", "G", "T")
dna_alphabet_trinuc <- do.call(paste0, expand.grid(dna_alphabet,
dna_alphabet,
dna_alphabet))
seqlen <- length(givenSeq)
givenSeq_trinuc <- unlist(lapply(seq_len(seqlen - 2), function(x) {
paste0(givenSeq[x], givenSeq[x + 1], givenSeq[x + 2])
}))
use_colnames <- .get_feat_names(alph=dna_alphabet, k=3, seqlen=seqlen)
if (seqlen > 0) {
one_hot_encoded_trinuc_profile <- matrix(
rep(0, length(dna_alphabet_trinuc) *
seqlen), nrow = 1, byrow = TRUE)
#
for (i in seq_along(dna_alphabet_trinuc)) {
one_hot_encoded_trinuc_profile[, (i - 1) * seqlen +
which(givenSeq_trinuc == dna_alphabet_trinuc[i])] <- 1
}
##
colnames(one_hot_encoded_trinuc_profile) <- use_colnames
##
return(one_hot_encoded_trinuc_profile)
} else {
stop("Empty or NULL found")
}
}
## =============================================================================
.get_feat_names <- function(alph = c('A', 'C', 'G', 'T'), k=1, seqlen){
stopifnot(k<=3)
if(k==1) kmers <- alph
if(k==2) kmers <- get_dimers_from_alphabet(alph)
if(k==3) kmers <- get_trimers_from_alphabet(alph)
use_feat_names <- paste(rep(kmers, each = seqlen), seq_len(seqlen),
sep=".")
use_feat_names
}
## =============================================================================
# @title One-hot encode dinucleotide profiles
#
# @description One-hot encode the dinucleotide profile of a given DNA sequence.
#
# @param givenSeq A single sequence.
#
# @return The one-hot encoded sequence.
#
.one_hot_encode_dinuc <- function(givenSeq) {
# Input: A DNA seq as a vector of caharacters (A/C/G/T)
# Returns: A row matrix of
# size 4*seqlen
dna_alphabet <- c("A", "C", "G", "T")
dna_alphabet_dinuc <- do.call(paste0, expand.grid(dna_alphabet,
dna_alphabet))
seqlen <- length(givenSeq)
givenSeq_dinuc <- unlist(lapply(seq_len(seqlen - 1), function(x) {
paste0(givenSeq[x], givenSeq[x + 1])
}))
use_colnames <- .get_feat_names(alph=dna_alphabet, k=2, seqlen=seqlen)
if (seqlen > 0) {
one_hot_encoded_dinuc_profile <- matrix(
rep(0, length(dna_alphabet_dinuc) *
seqlen), nrow = 1, byrow = TRUE)
#
for (i in seq_along(dna_alphabet_dinuc)) {
one_hot_encoded_dinuc_profile[, (i - 1) * seqlen +
which(givenSeq_dinuc == dna_alphabet_dinuc[i])] <- 1
}
##
colnames(one_hot_encoded_dinuc_profile) <- use_colnames
##
return(one_hot_encoded_dinuc_profile)
} else {
stop("Empty or NULL found")
}
}
## =============================================================================
# @title One-hot decode
#
# @description One-hot decode a given one-hot encoded DNA sequence.
#
# @param oneHotEncodedSeqV A single one-hot encoded sequence vector.
#
# @return The one-hot decoded sequence of ACGTs.
#
.one_hot_decode <- function(oneHotEncodedSeqV) {
dna_alphabet <- c("A", "C", "G", "T")
seqlen <- length(oneHotEncodedSeqV)/length(dna_alphabet)
decodedSeq <- rep("Q", seqlen)
for (alpha_char in seq_along(dna_alphabet)) {
cutp <- seqlen
startp <- (alpha_char * cutp) - cutp + 1
endp <- (alpha_char * cutp)
decodedSeq[which(oneHotEncodedSeqV[startp:endp] == 1)] <-
dna_alphabet[alpha_char]
}
return(paste0(decodedSeq, collapse = ""))
}
## =============================================================================
#' @title Get one-hot encoded sequences
#'
#' @description Get the one-hot encoding representation of the given sequences.
#'
#' @param seqs A \code{\link[Biostrings]{DNAStringSet}} object holding the
#' given DNA sequences
#' @param sinuc_or_dinuc character string, 'sinuc' or 'dinuc' to select for
#' mono- or dinucleotide profiles.
#'
#' @return A sparse matrix of sequences represented with one-hot-encoding
#' @family input functions
#' @seealso \code{\link{prepare_data_from_FASTA}} for generating one-hot
#' encoding of sequences from a FASTA file
#' @importFrom Matrix Matrix
#'
#' @examples
#'
#' fname <- system.file("extdata", "example_data.fa",
#' package = "archR", mustWork = TRUE)
#'
#'
#' rawSeqs <- prepare_data_from_FASTA(fasta_fname = fname,
#' raw_seq = TRUE)
#'
#' seqs_dinuc <- get_one_hot_encoded_seqs(seqs = rawSeqs,
#' sinuc_or_dinuc = "dinuc")
#'
#' @export
get_one_hot_encoded_seqs <- function(seqs, sinuc_or_dinuc = "sinuc") {
#
if (!is.null(seqs) && length(seqs) > 0) {
seqs_split_as_list <-
base::strsplit(as.character(seqs), split = NULL)
if (length(seqs_split_as_list) > 0) {
if (sinuc_or_dinuc == "sinuc") {
.msg_pstr("Generating dinucleotide profiles", flg=TRUE)
encoded_seqs <- lapply(seqs_split_as_list,
.one_hot_encode_sinuc)
} else if (sinuc_or_dinuc == "dinuc") {
.msg_pstr("Generating dinucleotide profiles", flg=TRUE)
encoded_seqs <-
lapply(seqs_split_as_list, .one_hot_encode_dinuc)
} else if (sinuc_or_dinuc == "trinuc") {
.msg_pstr("Generating trinucleotide profiles", flg=TRUE)
encoded_seqs <-
lapply(seqs_split_as_list, .one_hot_encode_trinuc)
}
encoded_seqs <- do.call(rbind, encoded_seqs)
encoded_seqs <- t(encoded_seqs)
## Use Matrix package, store as sparse matrix
## Helps reduce object size and computation time
encoded_seqs <- Matrix::Matrix(encoded_seqs, sparse = TRUE)
##
return(encoded_seqs)
}
} else {
stop("Empty or NULL found")
}
}
## =============================================================================
# @title Assert attributes of sequences
#
# @description Assert the attributes of the sequences provided. This includes
# checking for (1) the length of the sequences, (2) characters in the
# sequences.
#
# @param givenSeqs DNA sequences as a list.
#
#
# @return nothing. Only prints a warning to the screen.
# @importFrom Biostrings width
#
.assert_seq_attributes <- function(givenSeqs) {
# Check that all sequences are of same length
seqs_split_as_list <-
base::strsplit(as.character(givenSeqs), split = NULL)
# length_vals <- unlist(lapply(seqs_split_as_list, length))
length_vals <- levels(as.factor(Biostrings::width(givenSeqs)))
char_levels <- levels(as.factor(unlist(seqs_split_as_list)))
dna_alphabet <- c("A", "C", "G", "T")
if (0 %in% length_vals) {
# Checking sequences of length 0
stop(paste0("Found ", which(0 == length_vals), " sequence(s) of length
zero\n"))
#
} else if (length(levels(as.factor(length_vals))) > 1) {
# Checking all sequences are of same length
stop("Sequences are of different length\n")
#
} else if (any(!(char_levels %in% dna_alphabet))) {
# Check for non-alphabet characters
# Raise either an error or just warn!
warning(c("Non DNA-alphabet character in the sequences: ",
char_levels), immediate. = TRUE)
} else {
# All OK!
.msg_pstr("Sequences OK, ", levels(length_vals)[1], flg=TRUE)
}
}
## =============================================================================
#' @title
#' Generate one-hot encoding of sequences given as FASTA file
#'
#' @description
#' Given a set of sequences in a FASTA file this function returns a sparse
#' matrix with one-hot encoded sequences.
#' In this matrix, the sequence features are along rows, and sequences along
#' columns. Currently, mono- and dinucleotide features for DNA sequences are
#' supported. Therefore, the length of the feature vector is 4 and 16 times
#' the length of the sequences (since the DNA alphabet is four characters)
#' for mono- and dinucleotide features respectively.
#'
#' @param fasta_fname Provide the name (with complete path) of the input
#' FASTA file.
#'
#' @param raw_seq TRUE or FALSE, set this to TRUE if you want the raw sequences.
#'
#' @param sinuc_or_dinuc character string, 'sinuc' or 'dinuc' to select for
#' mono- or dinucleotide profiles.
#'
#' @return A sparse matrix of sequences represented with one-hot-encoding.
#' @family input functions
#' @seealso \code{\link{get_one_hot_encoded_seqs}} for directly using a
#' DNAStringSet object
#' @importFrom Biostrings DNAStringSet
#'
#' @examples
#'
#' fname <- system.file("extdata", "example_data.fa",
#' package = "archR", mustWork = TRUE)
#'
#' # mononucleotides feature matrix
#' rawSeqs <- prepare_data_from_FASTA(fasta_fname = fname,
#' sinuc_or_dinuc = "sinuc")
#'
#' # dinucleotides feature matrix
#' rawSeqs <- prepare_data_from_FASTA(fasta_fname = fname,
#' sinuc_or_dinuc = "dinuc")
#'
#' # FASTA sequences as a Biostrings::DNAStringSet object
#' rawSeqs <- prepare_data_from_FASTA(fasta_fname = fname,
#' raw_seq = TRUE)
#'
#' @export
prepare_data_from_FASTA <- function(fasta_fname, raw_seq = FALSE,
sinuc_or_dinuc = "sinuc") {
if (!file.exists(fasta_fname)) {
stop("File not found, please check if it exists")
}
if (raw_seq) {
givenSeqs <- Biostrings::readDNAStringSet(
filepath = fasta_fname, format = "fasta", use.names = TRUE)
givenSeqs <- Biostrings::DNAStringSet(toupper(givenSeqs))
return(givenSeqs)
} else {
#
givenSeqs <- Biostrings::readDNAStringSet(
filepath = fasta_fname, format = "fasta", use.names = FALSE)
givenSeqs <- Biostrings::DNAStringSet(toupper(givenSeqs))
.assert_seq_attributes(givenSeqs)
.msg_pstr("Read ", length(givenSeqs), " sequences", flg=TRUE)
#
oheSeqs <- get_one_hot_encoded_seqs(givenSeqs,
sinuc_or_dinuc = sinuc_or_dinuc)
return(oheSeqs)
}
}
## =============================================================================
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.