Nothing
#' Swing statistic
#'
#' @description This function integrates the kinase-substrate predictions,
#' directionality of phosphopeptide fold change and signficance to assess
#' local connectivity (swing) of kinase-substrate networks. The final score
#' is a normalised and weighted score of predicted kinase activity. If
#' permutations are selected, network node:edges are permutated. P-values will
#' be calculated for both ends of the distribution of swing scores (positive and
#' negative swing scores).
#' @param input_data A data.frame of phoshopeptide data. Must contain 4 columns
#' and the following format must be adhered to. Column 1 - Annotation, Column 2
#' - centered peptide sequence, Column 3 - Fold Change [-ve to +ve], Column 4
#' - p-value [0-1]. This must be the same dataframe used in scoreSequences()
#' @param pwm_in List of PWMs created using buildPWM()
#' @param pwm_scores List of PWM-substrate scores created using
#' scoreSequences()
#' @param pseudo_count Pseudo-count acts at two levels. 1) It adds a small
#' number to the counts to avoid zero divisions, which also 2) avoids log-zero
#' transformations. Note that this means that pos, neg and all values in the
#' output table include the addition of the pseudo-count. Default: "1"
#' @param p_cut_pwm Significance level for determining a significant
#' kinase-substrate enrichment. Default: "0.05"
#' @param p_cut_fc Significance level for determining a significant level of
#' Fold-change in the phosphoproteomics data. Default: "0.05"
#' @param permutations Number of permutations to perform. This will shuffle the
#' kinase-subtrate edges of the network n times. To not perform permutations and
#' only generate the scores, set permutations=1 or permutations=FALSE. Default:
#' "1000"
#' @param return_network Option to return an interaction network for visualising
#' in cystoscape. Default = FALSE
#' @param verbose Turn verbosity on/off. To turn on, verbose=TRUE. Options are:
#' "TRUE, FALSE". Default=FALSE
#'
#' @examples
#' ## import data
#' data(example_phosphoproteome)
#' data(phosphositeplus_human)
#'
#' ## clean up the annotations
#' ## sample 100 data points for demonstration
#' sample_data <- head(example_phosphoproteome, 100)
#' annotated_data <- cleanAnnotation(input_data = sample_data)
#'
#' ## build the PWM models:
#' set.seed(1234)
#' sample_pwm <- phosphositeplus_human[sample(nrow(phosphositeplus_human),
#' 1000),]
#' pwms <- buildPWM(sample_pwm)
#'
#' ## score the PWM - substrate matches
#' ## Using a "random" background, to calculate the p-value of the matches
#' ## Using n = 100 for demonstration
#' ## set.seed for reproducibility
#' set.seed(1234)
#' substrate_scores <- scoreSequences(input_data = annotated_data,
#' pwm_in = pwms,
#' background = "random",
#' n = 100)
#'
#' ## Use substrate_scores and annotated_data data to predict kinase activity.
#' ## This will permute the network node and edges 10 times for demonstration.
#' ## set.seed for reproducibility
#' set.seed(1234)
#' swing_output <- swing(input_data = annotated_data,
#' pwm_in = pwms,
#' pwm_scores = substrate_scores,
#' permutations = 10)
#'
#' @return A data.table of swing scores
#'
#' @export swing
#' @importFrom BiocParallel bplapply
#' @importFrom BiocParallel MulticoreParam
#' @importFrom stats setNames
#' @importFrom stats sd
#' @importFrom data.table melt.data.table
swing <-
function(input_data = NULL,
pwm_in = NULL,
pwm_scores = NULL,
pseudo_count = 1,
p_cut_pwm = 0.05,
p_cut_fc = 0.05,
permutations = 1000,
return_network = FALSE,
verbose = FALSE) {
#----------------------------------------------
#format checks:
if (is.null(input_data))
stop("input_data not provided; you must provide an input table")
if (!is.data.frame(input_data))
stop("input_data is not a data.frame; you must provide an input table")
if (is.null(pwm_in))
stop(
"pwm_in not provided; you must provide an input table containing
computed position weight matrices using buildPWM()"
)
if (!is.list(pwm_in))
stop(
"pwm_in is not a list format; something has gone wrong. Make sure you
compute the position weight matrices using buildPWM()"
)
if (is.null(pwm_scores))
stop(
"pwm_scores not provided; you must provide an input table containing
computed scores using scoreSequences()"
)
if (!is.list(pwm_scores))
stop(
"pwm_scores is not a list format; something has gone wrong. Make sure
PWM-substrate matches are scored using scoreSequences()"
)
if (p_cut_pwm >= 1)
stop("p_cut_pwm needs to be less than 1; make sure your p-values are not
log transformed")
if (p_cut_fc >= 1)
stop("p_cut_fc needs to be less than 1; make sure your p-values are not
log transformed")
if (permutations != FALSE &
(!is.numeric(permutations) | permutations < 1))
stop(
"permutations needs to be a numerical number. Either 1 or FALSE (for no
permutation) or greater than 1 to set the number of pemutations"
)
#----------------------------------------------
# 1. binarise p-values and fold change
if (verbose) {
start_time <- Sys.time()
message("Start: ", start_time)
message("[Step1/3] : Calculating Swing Scores")
}
pwm_pval <- pwm_scores$peptide_p
# binarise p-values
pwm_pval[, 3:ncol(pwm_pval)] <-
ifelse(pwm_pval[, 3:ncol(pwm_pval)] > p_cut_pwm, 0, 1)
# binarise FC
input_data[, 3] <- ifelse(as.numeric(input_data[, 3]) > 0, 1, -1)
# binarise p-value
input_data[, 4] <-
ifelse(as.numeric(input_data[, 4]) > p_cut_fc, 0, 1)
# compute Sipk statistic (Sipk - see paper):
input_data$Sipk <-
as.numeric(input_data[, 3]) * as.numeric(input_data[, 4])
# 2. merge tables, summarise counts and unique
input_data$annotation <-
paste(input_data$annotation, input_data$peptide, sep = "::")
pwm_pval$annotation <-
paste(pwm_pval$annotation, pwm_pval$peptide, sep = "::")
data_merge <-
unique(merge(input_data, pwm_pval, by = "annotation"))
# 3. Swing scores of REAL data.
swing_out <- swingScore(
data_merge = data_merge,
pwm_in = pwm_in,
permute = FALSE,
pseudo_count = pseudo_count
)
# 4. permute and calculate p-values
if (permutations != FALSE && permutations > 1) {
if (verbose) {
message("[Step2/3] : Permuting Network")
}
# obtain permuted column names
swing_names <- colnames(data_merge)[7:ncol(data_merge)]
n_permute <- lapply(seq_len(permutations), function(i)
sample(as.character(swing_names), length(swing_names),replace = FALSE))
names(n_permute) <- paste("rand", seq_len(permutations), sep = "")
# calculate swing scores (with permuted names) and return as vector
swing_permute <- list(bplapply(seq_len(permutations), function(i)
swingScore(
data_merge = setNames(data.frame(data_merge),
c(colnames(data_merge)[1:6],
n_permute[[i]])),
pwm_in = pwm_in,
permute = TRUE,
pseudo_count = pseudo_count,
n = i
)))
# returns ordered dataframe; order names and merge
swing_permute_names <- swing_names[order(swing_names)]
swing_permute <- data.frame("kinase" = swing_permute_names,
swing_permute)
colnames(swing_permute) <- c("kinase", names(n_permute))
if (verbose) {
message("[Step3/3] : Calculating p-values")
}
# obtain p-values, two sided
swing_out$p_greater <-
unlist(bplapply(seq_len(nrow(swing_out)), function(i)
(sum(
ifelse(
as.numeric(swing_permute[swing_permute$kinase ==
swing_out$kinase[i], ][2:ncol(swing_permute)]) >
as.numeric(swing_out$swing_raw[i]), 1, 0),
na.rm = TRUE) + 1)
/ (as.numeric(permutations) + 1)))
swing_out$p_less <-
unlist(bplapply(seq_len(nrow(swing_out)), function(i)
(sum(
ifelse(
as.numeric(swing_permute[swing_permute$kinase ==
swing_out$kinase[i], ][2:ncol(swing_permute)]) <
as.numeric(swing_out$swing_raw[i]), 1, 0
), na.rm = TRUE) + 1)
/ (as.numeric(permutations) + 1)))
# order by p-value
swing_out <- swing_out[order(swing_out$p_greater),]
}
if (verbose) {
message("[FINISHED]\n")
end_time <- Sys.time() - start_time
message("Finish: ", Sys.time())
}
# set option to return a network
network <- NULL
if (return_network == "TRUE"){
network <- data.table::melt(data_merge,
id.vars = c("annotation"),
measure.vars =
colnames(data_merge[7:ncol(data_merge)]))
# keep "significant" edges
network <- network[network$value == 1, ]
network <- data.frame(
"source" = as.character(network$variable),
"target" = as.character(network$annotation)
)
}
return(list("scores" = swing_out, "network" = network))
}
# describeIn swing This helper function performs the swing score calculation
# no verbose in this helper
# permute - for setting right table format (using data.table) for merging.
swingScore <-
function(data_merge,
pwm_in,
pseudo_count = 1,
permute = FALSE,
n = 1) {
#----------------------------------------------
data_merge <- data_merge$Sipk * data_merge[7:ncol(data_merge)]
# propogation of NaN here due to small number division, add psuedo-count
# count significant positive
p_k <- colSums(data_merge == 1, na.rm = TRUE) + pseudo_count
# count significant negative
n_k <- colSums(data_merge == -1, na.rm = TRUE) + pseudo_count
#all counts (both positive and negative).
t_k <- p_k + n_k
#----------------------------------------------
# compute swing statistics
pk_ratio <- (p_k)/(t_k)
nk_ratio <- (n_k)/(t_k)
swing_raw <- (pk_ratio) / (nk_ratio)
p_n_all <-
data.frame(
"kinase" = colnames(data_merge),
"pos" = p_k,
"neg" = n_k,
"all" = t_k,
"pk" = pk_ratio,
"nk" = nk_ratio,
"swing_raw" = swing_raw
)
# include the count numbers:
p_n_all <- merge(p_n_all, pwm_in$kinase, by = "kinase", sort = TRUE)
# weighted by number of substrates
# not possible to have zero - must have substrates to build model
p_n_all$swing_raw <-
log2(p_n_all$swing_raw) * log2(p_n_all$n)
# weighted by size of kinase-substrate network:
p_n_all$swing_raw <-
p_n_all$swing_raw * log2(p_n_all$all)
# z-score transform
p_n_all$swing <-
(p_n_all$swing_raw - mean(p_n_all$swing_raw, na.rm = TRUE)) /
sd(p_n_all$swing_raw, na.rm = TRUE)
if (permute == "TRUE") {
p_n_all <- p_n_all$swing_raw
}
return(p_n_all)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.