#' Read and tidy a FASTA file
#'
#' reads and tidys FASTA file.
#'
#' @export
#' @importFrom Biostrings readAAStringSet
#' @importFrom dplyr left_join
#' @import tidyverse
#' @import tibble
#' @param path a string of path pointing towards a fasta file
#' @return a `tibble` of formatted FASTA information
#' @examples
#' tidyFasta(system.file("extdata", "O13297.fasta", package="MSstatsLiP"))
tidyFasta <- function(path) {
# Check input
if (missing(path))
stop(paste0("The input ", sQuote("path"), " is missing"))
if (!is.character(path))
stop("Provide the path to the FASTA file as a string")
if (length(path) != 1)
stop("Provide only one path to the FASTA file at a time")
# if (!file.exists(path))
# stop(paste0("The file ", sQuote(path), " does not exist"))
aa <- readAAStringSet(path)
aa <- as.character(aa) # named vector
header <- names(aa)
pat_ac <- paste0(
"([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]",
"([A-Z][A-Z0-9]{2}[0-9]){1,2})"
)
pat_iso <- paste0(
"([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]",
"([A-Z][A-Z0-9]{2}[0-9]){1,2})([-]\\d{1,}){0,1}"
)
mch_head <- regexpr(
pattern = "([^\\s]*)(?=\\s)", text = header, perl = TRUE
)
shead <- regmatches(header, m = mch_head)
mch_uniprot <- regexpr(pattern = pat_ac, text = shead)
matched <- mch_uniprot != (-1)
ac <- regmatches(shead, regexpr(pattern = pat_ac, text = shead))
iso <- regmatches(shead, regexpr(pattern = pat_iso, text = shead))
trimmed <- sub("^(sp|tr)(?=\\|)", "", shead, perl = TRUE)
trimmed <- sub(pat_iso, "", trimmed)
entry <- gsub("\\|", "", trimmed)
tibble(
header = header[matched], sequence = as.vector(aa[matched]),
uniprot_ac = ac, uniprot_iso = iso, entry_name = entry[matched]
)
}
#' Annotate modified sites with associated peptides
#'
#' \code{PTMlocate} annotates modified sites with associated peptides.
#'
#' @param peptide A string vector of peptide sequences. The peptide sequence
#' does not include its preceding and following AAs.
#' @param uniprot A string vector of Uniprot identifiers of the peptides'
#' originating proteins. UniProtKB entry isoform sequence is used.
#' @param fasta A tibble with FASTA information. Output of \code{tidyFasta}.
#' @param modResidue A string. Modifiable amino acid residues.
#' @param modSymbol A string. Symbol of a modified site.
#' @param rmConfound A logical. \code{TRUE} removes confounded
#' unmodified sites, \code{FALSE} otherwise. Default is \code{FALSE}.
#'
#' @return A data frame with three columns: \code{uniprot_iso}, \code{peptide},
#' \code{site}.
#'
#' @examples
#' fasta <- tidyFasta(system.file("extdata", "O13297.fasta", package="MSstatsLiP"))
#' locatePTM("DRVSYIHNDSC*TR", "O13297", fasta, "C", "\\*")
#'
#' @export
locatePTM <- function(peptide, uniprot, fasta, modResidue, modSymbol,
rmConfound=FALSE) {
if (!.locateCheck(peptide, uniprot, fasta, modResidue, modSymbol))
stop("Input checking fails!")
peptide_seq <- tibble(uniprot_iso = uniprot, peptide = peptide)
peptide_seq$is_mod <- grepl(modSymbol, peptide_seq$peptide)
peptide_seq$peptide_unmod <- gsub(modSymbol, "", peptide_seq$peptide)
# Locate modifiable sites
col_seq <- c("uniprot_iso", "sequence")
sub_fasta <- fasta[fasta$uniprot_iso %in% uniprot, col_seq]
loc <- gregexpr(modResidue, sub_fasta$sequence, fixed = TRUE)
sub_fasta$idx_site_full <- lapply(loc, .location_start)
# Locate peptides (use extended AAs for specific matching when possible)
peptide_fasta <- dplyr::left_join(peptide_seq, sub_fasta)
loc <- Map(function(p, s) gregexpr(p, s, fixed = TRUE)[[1]],
as.list(peptide_fasta$peptide_unmod),
as.list(peptide_fasta$sequence))
n <- vapply(loc, .num_match, FUN.VALUE = integer(1))
peptide_fasta <- peptide_fasta[n == 1L, ]
loc <- loc[n == 1L]
peptide_fasta$idx_peptide <- lapply(loc, .location)
peptide_fasta$aa_start <- vapply(loc, .location_start, numeric(1))
# Pattern of modified sites
mod_pattern <- paste0(modResidue, modSymbol)
# Locate modifiable, modified sites (AA residues) associated with peptides
peptide_fasta$idx_site <- Map(.covered_set,
peptide_fasta$idx_site_full,
peptide_fasta$idx_peptide)
peptide_fasta$idx_mod <- Map(
function(p, a) locateMod(p, a, residueSymbol = mod_pattern),
as.list(peptide_fasta$peptide), as.list(peptide_fasta$aa_start)
)
peptide_fasta$mod_aa <- Map(
function(s, i) if (length(i) == 0) character() else substring(s, i, i),
as.list(peptide_fasta$sequence), peptide_fasta$idx_mod
)
# Annotate modified sites
peptide_fasta$len_site <- lapply(
peptide_fasta$idx_site_full, function(x) nchar(x[length(x)])
)
peptide_fasta$site <- Map(annotSite,
peptide_fasta$idx_mod, peptide_fasta$mod_aa,
peptide_fasta$len_site)
peptide_fasta$site <- as.character(peptide_fasta$site)
col_fasta <- c("uniprot_iso", "peptide", "peptide_unmod", "is_mod",
"idx_site", "idx_mod", "site")
peptide_fasta <- peptide_fasta[, col_fasta]
.rmConfounded(peptide_fasta, rmConfound)
}
#' @noRd
#' @importFrom dplyr group_by mutate ungroup bind_rows
.rmConfounded <- function(peptideFasta, rmConfound) {
# Handle confounded unmodified sites
col_res <- c("uniprot_iso", "peptide", "site")
if (rmConfound) {
by_prot <- group_by(peptideFasta, .data$uniprot_iso)
by_prot <- mutate(
by_prot, idx_mod_full = list(unique(unlist(.data$idx_mod)))
)
by_prot <- ungroup(by_prot)
by_prot <- by_prot[!by_prot$is_mod, ]
cnfnd <- vector("logical", nrow(by_prot))
for (i in seq_along(cnfnd)) {
cnfnds <- by_prot$idx_site[[i]] %in% by_prot$idx_mod_full[[i]]
cnfnd[i] <- any(cnfnds)
}
s_unmod <- by_prot[!cnfnd, col_res]
n1 <- vapply(peptideFasta$idx_mod, .length_one, FUN.VALUE = logical(1))
s_mod <- peptideFasta[peptideFasta$is_mod & n1, col_res]
res <- bind_rows(s_mod, s_unmod)
} else {
n1 <- vapply(peptideFasta$idx_mod, .length_one, FUN.VALUE = logical(1))
res <- peptideFasta[!peptideFasta$is_mod | n1, col_res]
}
res
}
#' @noRd
.num_match <- function(x) {
ifelse(
is.na(x) || identical(as.vector(x), -1L),
0L, length(attr(x, "match.length"))
)
}
#' @noRd
.location_start <- function(x) {
start <- as.vector(x)
if (identical(start, -1L)) {
return(integer())
}
start
}
#' @noRd
.location <- function(x) {
start <- as.vector(x)
if (identical(start, -1L)) {
return(cbind(start = integer(), end = integer()))
}
end <- as.vector(x) + attr(x, "match.length") - 1
cbind(start = start, end = end)
}
#' @noRd
.covered_set <- function(x, y) {
x[x >= y[1] & x <= y[2]]
}
#' @noRd
.length_one <- function(x) length(x) == 1
#' Locate modified sites with a peptide
#'
#' \code{locateMod} locates modified sites with a peptide.
#'
#' @param peptide A string. Peptide sequence.
#' @param aaStart An integer. Starting index of the peptide.
#' @param residueSymbol A string. Modification residue and denoted symbol.
#'
#' @return A string.
#'
#' @examples
#' locateMod("P*EP*TIDE", 3, "\\*")
#'
#' @export
locateMod <- function(peptide, aaStart, residueSymbol) {
x <- gregexpr(residueSymbol, peptide)[[1]]
start <- as.vector(x)
if (identical(start, -1L)) {
return(integer())
}
start - seq_along(start) + aaStart
}
#' Annotate modification site
#'
#' \code{annotSite} annotates modified sites as their residues and locations.
#'
#' @param aaIndex An integer vector. Location of the sites.
#' @param residue A string vector. Amino acid residue.
#' @param lenIndex An integer. Default is \code{NULL}
#'
#' @return A string.
#'
#' @examples
#' annotSite(10, "K")
#' annotSite(10, "K", 3L)
#'
#' @export
annotSite <- function(aaIndex, residue, lenIndex=NULL) {
if (length(aaIndex) == 0) {
return("None")
} else {
if (length(residue) != 1 && length(aaIndex) != length(residue))
stop("Lengths of aaIndex and residue don't match!")
if (!is.null(lenIndex)) {
if (!is.integer(lenIndex))
stop("lenIndex should be an integer!")
if (max(nchar(aaIndex)) > lenIndex)
stop("lenIndex doesn't reserve enough space for aaIndex!")
aa_idx_pad <- formatC(
aaIndex, width = lenIndex, format = "d", flag = "0"
)
site <- paste0(residue, aa_idx_pad, collapse = "-")
} else {
site <- paste0(residue, aaIndex, collapse = "-")
}
}
site
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.