#' @name findProtease
#' @title Find Proteases
#' @usage findProtease(protein, peptide, organism, start_pos, end_pos)
#' @description Given a vector of peptides and proteins,
#' finds known proteases acting on cleavage sites.
#' @param protein a vector of UniProt Accession IDs.
#' @param peptide a vector of amino acid sequences corresponding to the
#' proteins.
#' @param organism name of the substrate organism (default "Homo sapiens").
#' @param start_pos (optional) numeric vector of N-terminus positions in
#' protein sequence.
#' @param end_pos (optional) numeric vector of C-terminus positions in
#' protein sequence.
#'
#' @include Classes.R Generics.R Methods.R helper-functions.R
#'
#' @return S4 object Cleavages
#'
#' @examples
#' protein <- c("P02671", "P02671", "P68871", "P01011")
#' peptide <- c("FEEVSGNVSPGTR", "FVSETESR", "LLVVYPW", "ITLLSAL")
#' res <- findProtease(protein = protein,
#' peptide = peptide,
#' organism = "Homo sapiens")
#'
#' @importFrom data.table data.table
#'
#' @export
findProtease <- function(protein, peptide,
                         organism = "Homo sapiens",
                         start_pos, end_pos) {
    # Overview:
    #   1. Load the bundled MEROPS substrate table and the
    #      UniProt -> MEROPS identifier map.
    #   2. Obtain peptide start/end positions, either from the caller or by
    #      locating each peptide inside its protein sequence.
    #   3. Match peptide termini against MEROPS cleavage sites, map protease
    #      identifiers and wrap the result in a "Cleavages" S4 object.

    # Define local variables as NULL
    # (due to non-standard evaluation in data.table)
    `Substrate (Uniprot)` <- `Substrate organism` <- seq_name <- NULL

    # `organism` is used in scalar `if` conditions below, so reject anything
    # that is not exactly one value with a clear message.
    if (length(organism) != 1L) {
        stop("Organism must be a single value.")
    }

    # Internal data: MEROPS Substrate_search.sql and
    # Uniprot ID to MEROPS identifier mapping.
    # Resolved through the installed package so the function does not depend
    # on a path that only exists on one developer machine.
    mer <- data.table::fread(
        system.file("extdata", "mer.tab.gz",
                    package = "proteasy", mustWork = TRUE))
    merops_map <- data.table::fread(
        system.file("extdata", "merops.map.tab.gz",
                    package = "proteasy", mustWork = TRUE))

    unique_proteins <- unique(protein)

    if (!(organism %in% unique(mer$`Substrate organism`))) {
        stop("Organism not recognized")
    }

    if (missing(start_pos) || missing(end_pos)) {
        if (length(peptide) != length(protein)) {
            stop("Peptide and protein vectors must be the same length.")
        }
        # Find start_pos and end_pos by mapping
        # peptide against protein sequence.
        # The ensembldb method is used for human, rat and mouse; every other
        # organism falls back to the Rcpi method.
        seq_method <- if (organism %in% c("Homo sapiens",
                                          "Rattus norvegicus",
                                          "Mus musculus")) {
            "ensembldb"
        } else {
            "Rcpi"
        }
        p <- getSeqData(method = seq_method,
                        protein = protein,
                        organism = organism)

        # Proteins where sequence data was not found
        unmapped <- unique_proteins[!(unique_proteins %in% p$seq_name)]
        if (length(unmapped) == length(unique_proteins)) {
            stop("No accessions could be mapped.")
        }
        if (length(unmapped) > 0) {
            # Message is assembled from single-line pieces so that no stray
            # line break ends up inside the emitted warning.
            warning(paste0(
                "Some protein accessions could not be mapped (",
                length(unmapped),
                "). This could be due to use of obsolete ",
                "accessions or incorrect identifier type. ",
                paste0(unmapped, collapse = ", "))
            )
        }

        # Keyed join: one row per unique (protein, peptide) pair, carrying
        # the sequence columns of `p`.
        p <- p[seq_name %in% unique_proteins, ]
        input <- unique(data.table::data.table(protein, peptide))
        data.table::setkeyv(p, "seq_name")
        data.table::setkeyv(input, "protein")
        input <- p[input]

        # Locate each peptide in its protein sequence; unmatched peptides
        # yield NA positions.
        located <- stringr::str_locate(input$sequence, input$peptide)
        input$start_pos <- as.character(located[, "start"])
        input$end_pos <- as.character(located[, "end"])

        # NOTE(review): presumably the first column of the join result is
        # the `seq_name` key of `p` -- confirm against getSeqData().
        names(input)[1] <- "protein"
    } else {
        # Find using position matching
        input <- data.table::data.table(protein = protein,
                                        sequence = NA,
                                        peptide = peptide,
                                        start_pos = as.character(start_pos),
                                        end_pos = as.character(end_pos))
    }

    # Restrict MEROPS records to the requested organism and substrates.
    mer <- mer[`Substrate organism` == organism &
                   `Substrate (Uniprot)` %in% unique_proteins]

    r <- matchTermini(input, mer)
    r <- mapMEROPSIDs(r, merops_map)

    # Keep the first row for every distinct cleavage event.
    r <- unique(r, by = c("peptide",
                          "protein",
                          "Protease (Uniprot)",
                          "Protease (MEROPS)",
                          "terminus",
                          "Cleavage type"))

    # Positional rename: relies on the column order produced by
    # mapMEROPSIDs().
    names(r) <- c("Protease (Uniprot)",  # 1
                  "Protease status",     # 2
                  "Protease organism",   # 3
                  "Protease (MEROPS)",   # 4
                  "Substrate (Uniprot)", # 5
                  "Substrate sequence",  # 6
                  "Peptide",             # 7
                  "Start position",      # 8
                  "End position",        # 9
                  "Cleaved residue",     # 10
                  "Substrate name",      # 11
                  "Substrate organism",  # 12
                  "Protease name",       # 13
                  "Cleavage type",       # 14
                  "Cleaved terminus")    # 15

    substrate_seq <- r[["Substrate sequence"]]

    substrate <- data.table::data.table(
        "Substrate name" = r[["Substrate name"]],
        "Substrate (Uniprot)" = r[["Substrate (Uniprot)"]],
        "Substrate sequence" = substrate_seq,
        # as.character() guards against a logical NA column (sequence is set
        # to NA when positions are supplied): nchar(NA) is 2, whereas
        # nchar(NA_character_) is NA.
        "Substrate length" = nchar(as.character(substrate_seq)),
        "Peptide" = r[["Peptide"]],
        "Start position" = r[["Start position"]],
        "End position" = r[["End position"]])

    protease <- data.table::data.table(
        "Protease name" = r[["Protease name"]],
        "Protease (Uniprot)" = r[["Protease (Uniprot)"]],
        "Protease status" = r[["Protease status"]],
        "Protease (MEROPS)" = r[["Protease (MEROPS)"]],
        "Protease URL" = paste0(
            "https://www.ebi.ac.uk/merops/cgi-bin/pepsum?id=",
            r[["Protease (MEROPS)"]]))

    cleavage <- data.table::data.table(
        "Substrate (Uniprot)" = r[["Substrate (Uniprot)"]],
        "Peptide" = r[["Peptide"]],
        "Protease (Uniprot)" = r[["Protease (Uniprot)"]],
        "Protease status" = r[["Protease status"]],
        "Cleaved residue" = r[["Cleaved residue"]],
        "Cleaved terminus" = r[["Cleaved terminus"]],
        "Cleavage type" = r[["Cleavage type"]])

    # Substrate and protease tables are de-duplicated; the cleavage table
    # keeps one row per matched event.
    methods::new(
        Class = "Cleavages",
        organism = organism,
        substrate = unique(substrate),
        protease = unique(protease),
        cleavage = cleavage
    )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.