# HELPER FUNCTIONS FOR PARALLEL SIGNALP:
#' split XStringSet objects
#'
#' This function splits large XStringSet objects into chunks of given size and
#' returns a list of AAStringSet objects.
#' @param string_set input AAStringSet object;
#' @param chunk_size the number of sequences in a single chunk;
#' @return a list of AAStringSet chunks.
#' @export
#' @examples
#' # Read fasta file:
#' aa <- readAAStringSet(system.file("extdata", "sample_prot_100.fasta",
#' package = "SecretSanta"))
#' # Split it into chunks
#' # with 10 sequences each:
#' split_XStringSet(aa,10)
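#' # The result is a plain list, so the chunks can be processed with lapply()
#' # or parallel::parLapply(); a brief check, assuming the bundled sample
#' # file contains 100 sequences:
#' chunks <- split_XStringSet(aa, 10)
#' length(chunks)          # expected: 10 chunks
#' unique(lengths(chunks)) # each chunk holds 10 sequences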
split_XStringSet <- function(string_set, chunk_size) {
if (!(is(string_set, 'XStringSet'))) {
stop('Input string_set does not belong to XStringSet class.')
}
lss <- length(string_set)
if (chunk_size > lss) {stop('Chunk size exceeds total seq number.')}
total_seq <- c(1:lss)
chunks <- split(total_seq,
ceiling(seq_along(total_seq) / chunk_size))
lapply(chunks, function(x) string_set[x])
}
#' combine multiple objects of SignalpResult class
#'
#' This function combines multiple instances of SignalpResult class,
#' typically generated with parLapply while running signalp predictions in
#' parallel mode.
#' @param arguments a list of SignalpResult objects to be combined
#' @export
#' @return SignalpResult object
#' @examples
#' aa <- readAAStringSet(system.file(
#' "extdata", "sample_prot_100.fasta", package = "SecretSanta"))
#' inp2 <- CBSResult(in_fasta = aa[1:10])
#' inp3 <- CBSResult(in_fasta = aa[20:30])
#' inp4 <- CBSResult(in_fasta = aa[40:50])
#' r1 <- signalp(input_obj = inp2, version = 4, organism = 'euk',
#' run_mode = 'starter')
#' r2 <- signalp(input_obj = inp3, version = 4, organism = 'euk',
#' run_mode = 'starter')
#' r3 <- signalp(input_obj = inp4, version = 4, organism = 'euk',
#' run_mode = 'starter')
#' obj <- list(r1, r2, r3)
#' combined_sp <- combine_SpResult(obj)
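#' # Inspect the merged object with the accessors defined for SignalpResult;
#' # the in_fasta slot should now hold the sequences of all three inputs:
#' length(getInfasta(combined_sp))
#' getSPversion(combined_sp)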
combine_SpResult <- function(arguments) {
if (!all(sapply(arguments, is, 'SignalpResult'))) {
stop('Some objects from the argument list do not belong to SignalpResult class.')
}
if (length(unique(sapply(arguments, getSPversion))) != 1) {
stop('Supplied objects were generated with different versions of SignalP.')
}
# combine accessors for fasta slots in a list
fasta_getters <- list(getInfasta, getOutfasta, getMatfasta)
# apply the list with fasta getters to all the elements in the argument list
z <- Map(function(x) sapply(fasta_getters, function(f) f(x)), arguments)
# combine fasta fields by slot types
got_fastas <- sapply(1:3,
function(i) do.call(c, sapply(z, function(x) x[i])))
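# got_fastas is a list of length 3: element 1 holds all in_fasta slots,
# element 2 all out_fasta slots, element 3 all mature_fasta slots, each
# concatenated across the supplied SignalpResult objects with c()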
# arrange combined slots in SignalpResult object:
c_obj <- SignalpResult(
mature_fasta = got_fastas[[3]],
sp_tibble = do.call(rbind, (lapply(arguments, getSPtibble))),
sp_version = getSPversion(arguments[[1]])
)
c_obj <- setInfasta(c_obj, in_fasta = got_fastas[[1]])
c_obj <- setOutfasta(c_obj, out_fasta = got_fastas[[2]])
return(c_obj)
}
# PARALLEL SIGNALP ITSELF:
#' predict signal peptides
#'
#' This function calls the command line tool \code{signalp} to predict the presence and location of
#' signal peptide cleavage sites in amino acid sequences.
#' \cr
#' \cr
#' Large input files (>600 sequences) are automatically split into smaller chunks
#' so that \code{signalp} prediction can be run as an embarrassingly parallel process
#' on a specified number of cores.
#' @param input_obj an instance of class \code{CBSResult} containing protein
#' sequences as one of the attributes
#' @param version version of \code{signalp}, supported versions include: \cr
#' \code{signalp2}, \code{signalp3}, \code{signalp4}.
#' @param organism a character string with the following options:
#' \itemize{
#' \item \code{organism = "euk"} - for eukaryotes
#' \item \code{organism = "gram+"} - for gram-positive bacteria
#' \item \code{organism = "gram-"} - for gram-negative bacteria
#' }
#' @param run_mode a character string with the following options:
#' \itemize{
#' \item \code{run_mode = "starter"} - if it is the first step in pipeline
#' \item \code{run_mode = "piper"} - if you run this function on the output of other CBS tools
#' }
#' @param paths if the required version of \code{signalp} is not accessible globally, a file
#' containing the full path to its executable should be provided; for details
#' please check the SecretSanta vignette.
#' @param truncate a logical indicating:
#' \itemize{
#' \item \code{truncate = TRUE} - sequences longer than 2000 residues will be
#' truncated to this length limit and renamed
#' \item \code{truncate = FALSE} - long sequences will be excluded from the analysis
#' }
#' Default is \code{truncate = TRUE}.
#' @param cores number of cores for multicore processing. Default is \code{cores = 1}.
#' @param legacy_method an optional argument specifying which prediction method to use when running SignalP 2.0 and SignalP 3.0:
#' \itemize{
#' \item \code{legacy_method = "hmm"} - for HMM-based predictions
#' \item \code{legacy_method = "nn"} - for prediction based on neural networks
#' }
#' @param sensitive an optional logical argument:
#' \itemize{
#' \item \code{sensitive = TRUE} - if SignalP 4.1 is used, it will be run in sensitive mode,
#' i.e. predictions are re-thresholded with a lowered D-score cutoff
#' (0.34 for \code{'euk'}, 0.42 for \code{'gram+'} and \code{'gram-'}).
#' \item \code{sensitive = FALSE} - if SignalP 4.1 is used, it will be run with the default cutoff.
#' }
#' Default is \code{sensitive = FALSE}. For more details about SignalP 4.1 sensitive mode please see \url{http://www.cbs.dtu.dk/services/SignalP/performance.php}.
#' @return an object of SignalpResult class
#' @export
#' @examples
#' # read fasta file in AAStringSet object
#' aa <- readAAStringSet(system.file("extdata", "sample_prot_100.fasta",
#' package = "SecretSanta"))
#' # assign this object to the input_fasta slot
#' # of empty CBSResult object
#' inp <- CBSResult(in_fasta = aa[1:10])
#' # run signalp2 on the initial file:
#' r1 <- signalp(inp, version = 2, organism = 'euk', run_mode = "starter",
#' legacy_method = 'hmm')
#' r4 <- signalp(inp, version = 4, organism = 'euk', run_mode = "starter")
#' r4_sensitive <- signalp(inp, version = 4.1, organism = 'euk',
#' run_mode = 'starter', sensitive = TRUE)
#' @seealso \code{\link{parse_signalp}}
signalp <- function(input_obj,
version,
organism = c('euk', 'gram+', 'gram-'),
run_mode = c('starter', 'piper'),
paths = NULL,
truncate = NULL,
cores = 1,
sensitive = FALSE,
legacy_method) {
if (!is.numeric(cores))
stop('Cores argument must be numeric.', call. = FALSE)
if (parallel::detectCores() < cores)
stop(
"Your machine has only ",
parallel::detectCores() ,
" cores ... you specified: ",
cores,
" cores. Please fix.",
call. = FALSE
)
# ----- Check that inputs are valid
# arguments are present and have valid values:
if (missing(organism)) {
stop('Missing argument: organism.', call. = FALSE)
}
if (!is.element(organism, c("euk", "gram+", "gram-")))
stop("Please specify an organism that is supported by this function.", call. = FALSE)
if (missing(run_mode)) {
stop('Missing argument: run_mode.', call. = FALSE)
}
if (!is.element(run_mode, c("starter", "piper")))
stop("Please specify a run_mode that is supported by this function.", call. = FALSE)
organism <- match.arg(organism)
run_mode <- match.arg(run_mode)
# check that input object belongs to CBSResult class
if (!is(input_obj, "CBSResult")) {
stop('Input object does not belong to CBSResult superclass.')
}
# check that input_object contains non-empty in/out_fasta for starter/piper
if (run_mode == 'starter') {
if (length(getInfasta(input_obj)) != 0) {
fasta <- getInfasta(input_obj)
} else {
stop('in_fasta attribute is empty.', call. = FALSE)
}
} else if (run_mode == 'piper') {
if (length(getOutfasta(input_obj)) != 0) {
fasta <- getOutfasta(input_obj)
} else {
stop('out_fasta attribute is empty.', call. = FALSE)
}
}
# check that version number is valid:
if (is.element(version, c(2, 3, 4, 4.1))) {
version <- round(version) # versions 4.1 and 4 produce the same out format
} else {
stop('Version is invalid, allowed versions: c(2, 3, 4, 4.1).', call. = FALSE)
}
# check that legacy_method is provided for versions 2 and 3:
if (all(version < 4, missing(legacy_method))) {
stop('Missing argument: legacy_method.')
}
# check for extra arguments
if (all(version > 3, !missing(legacy_method))) {
message('version > 3, legacy_method is not required.')
}
# ----- Set default value for parameters if not provided:
if (is.null(truncate)) truncate <- TRUE
if (!is.logical(truncate))
stop("Truncate can only be TRUE or FALSE.", call. = FALSE)
# ------ Produce encouraging status messages, inputs should be ok.
# create actual tool name with version number provided
signalp_version <- paste("signalp", version, sep = '')
message(paste('Version used ...', signalp_version))
message("Running signalp locally ...")
# simple signalp, takes single AAStringSet as an input and runs
# signalp prediction on it
simple_signalp <- function(aaSet) {
# ---- Run prediction
# convert fasta to a temporary file:
out_tmp <- tempfile() #create a temporary file for fasta
writeXStringSet(aaSet, out_tmp) #write tmp fasta file to use later
message(paste('Submitted sequences...', length(aaSet)))
# get and check paths to signalp
if (is.null(paths)) {
full_pa <- signalp_version
} else {
mp <- suppressMessages(manage_paths(
in_path = FALSE,
test_tool = signalp_version,
path_file = paths))
full_pa <- mp$path_tibble$path
}
# decide how to run signalp depending on a version provided
if (version >= 4) {
# running signalp versions 4 and 4.1; potentially should work for 5
# check actual version of the tool used:
sp <-
tibble::as_tibble(read.table(text = (system(
paste(full_pa, "-t", organism, out_tmp),
intern = TRUE
))))
names(sp) <- c("gene_id", "Cmax", "Cpos",
"Ymax", "Ypos", "Smax",
"Spos", "Smean", "D",
"Prediction", "Dmaxcut",
"Networks-used")
# here we need to update prediction if sensitive mode is requested
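# (applies to SignalP 4.1 only) rather than re-running the tool, the
# short-format output is re-thresholded: rows whose D-score reaches the
# organism-specific cutoff hard-coded below are labelled 'Y', the rest 'N',
# and only 'Y' rows survive the Prediction filter further down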
verify_version <- system(paste(full_pa, '-V'), intern = TRUE)
if (all(verify_version != 'signalp 4.1', sensitive == TRUE)){
stop('Sensitive mode is available for SignalP version 4.1 only.')
}
if (verify_version == 'signalp 4.1') {
# check if sensitive param is correct and provided
if (!is.logical(sensitive)) {
stop('Sensitive must be a logical.')
}
if (sensitive == TRUE) {
message('Using sensitive method...')
# update cutoff scores
if (organism == 'euk') D_cutoff <- 0.34
if (organism == 'gram+') D_cutoff <- 0.42
if (organism == 'gram-') D_cutoff <- 0.42
# transform table output by cutoff: update Prediction column
sp <- sp %>% dplyr::mutate(Prediction = case_when(
D >= D_cutoff ~ 'Y',
D < D_cutoff ~ 'N'))
}
}
# patch ends
# reorder columns to match sp2/3 output:
sp <- sp[c("gene_id", "Cmax", "Cpos",
"Ymax", "Ypos", "Smax",
"Spos", "Smean", "Prediction")]
sp <- sp[sp$Prediction == 'Y',]
sp$Prediction <- ifelse(sp$Prediction == 'Y', 'Signal peptide', "none")
} else if (version < 4) {
# running signalp versions 2 and 3, call parse_signalp
message('signalp < 4, calling parser for the output ...')
con <-
system(paste(full_pa, "-t", organism, "-f short -trunc 70",
"-m", legacy_method, out_tmp), intern = TRUE)
sp <- parse_signalp(input = con,
input_type = "system_call",
version = version,
method = legacy_method,
source_fasta = out_tmp,
pred_filter = "Signal peptide")
}
message(paste('Candidate sequences with signal peptides ...', nrow(sp)))
if (nrow(sp) == 0) {
warning('Signal peptide prediction yielded 0 candidates ... ', call. = FALSE)
}
# generate cropped names for input fasta
names(aaSet) <- unname(sapply(names(aaSet), crop_names))
# get ids of candidate secreted proteins
out_fasta_sp <- aaSet[sp$gene_id]
# generate mature sequences
cropped_fasta <- subseq(out_fasta_sp, start = sp$Cpos, end = -1)
# construct output object
out_obj <- SignalpResult(mature_fasta = cropped_fasta,
sp_version = version,
sp_tibble = sp)
out_obj <- setInfasta(out_obj, in_fasta = aaSet)
out_obj <- setOutfasta(out_obj, out_fasta = out_fasta_sp)
# check that intended output is valid
if (validObject(out_obj)) {
return(out_obj)
}
} # close simple_signalp
# Handle long sequences if any present in the input,
# if necessary - run signalp as a parallel process
# estimate how big is the file, if required - split it into smaller
# chunks and run signalp as an embarassingly parallel process
fasta <- truncate_seq(truncate = truncate, fasta, 2000)
if (length(fasta) <= 600) {
message('Ok for single processing.')
return(simple_signalp(estimate_lim(fasta, truncate)))
} else {
message('Input fasta contains >600 sequences, entering batch mode.')
#split and check that chunks do not exceed 200K residue limit
split_fasta <-
sapply(split_XStringSet(fasta, 500), estimate_lim, truncate)
# Initiate cluster
cl <- parallel::makeCluster(cores)
# run parallel process
parallel::clusterExport(cl = cl, varlist = c("paths"), envir = environment())
result <- parallel::parLapply(cl, split_fasta, simple_signalp)
parallel::stopCluster(cl)
res_comb <- do.call(c, result)
combined_SignalpResult <- combine_SpResult(unname(res_comb))
sp_count <- nrow(getSPtibble(combined_SignalpResult))
message(paste('Candidate sequences with signal peptides ...', sp_count))
if (sp_count == 0) {
warning('Signal peptide prediction yielded 0 candidates ...', call. = FALSE)
}
closeAllConnections()
return(combined_SignalpResult)
}
}
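# A minimal usage sketch (not run) for the batch, multicore path above;
# 'big_prot.fasta' is a hypothetical placeholder for any fasta file with more
# than 600 sequences:
# aa_big <- Biostrings::readAAStringSet('big_prot.fasta')
# inp_big <- CBSResult(in_fasta = aa_big)
# res_big <- signalp(inp_big, version = 4, organism = 'euk',
#                    run_mode = 'starter', cores = 4)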