#' @title Perform Parameters Optimization
#' @description This function is used to optimize the critical
#' parameters of peak picking and alignment for
#' the following data processing. It utilizes the trimed data and
#' the internal instrument-specific parameters.
#' Parallel computing will be performed. The number of cores user
#' want to use could be specified.
#' @param mSet mSet object, usually generated by 'PerformROIExtraction'
#' or 'PerformDataTrimming' here.
#' @param param List, Parameters defined by 'SetPeakParam' function.
#' @param method Character, method of parameters optimization, including
#' "DoE' only. Default is "DoE". Other method
#' is under development.
#' @param ncore Numeric, CPU threads number used to perform the parallel
#' based optimization. If thers is memory issue,
#' please reduce the 'ncore' used here. For default, 2/3 CPU threads of
#' total will be used.
#' @param running.controller The resuming pipeline running controller. Optional.
#' Don't need to define by hand.
#' @export
#' @import MSnbase
#' @import progress
#' @import parallel
#' @return will a parameter object can be used for following processing
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
#' @examples
#'
#' DataFiles <- dir(system.file("mzData", package = "mtbls2"), full.names = TRUE,
#' recursive = TRUE)[c(10:12, 14:16)]
#' # remove the # before running the following command lines
#' # mSet <- PerformROIExtraction(datapath = DataFiles[c(1:2)],rt.idx = 0.25,
#' # rmConts = FALSE);#'
#' # best_params <- PerformParamsOptimization(mSet, param = SetPeakParam(), ncore = 4);
PerformParamsOptimization <- function(mSet,
param= NULL,
method="DoE",
ncore=4,
running.controller=NULL){
if (is(mSet,"mSet")) {
raw_data <- mSet@rawInMemory;
} else if (is(mSet,"MSnExp")) {
raw_data <- mSet;
} else {
stop("Wrong mSet object provided !")
}
if(!exists(".SwapEnv")){
.SwapEnv <<- new.env(parent = .GlobalEnv);
.SwapEnv$.optimize_switch <- FALSE;
.SwapEnv$count_current_sample <- 0;
.SwapEnv$count_total_sample <- 120; # maximum number for on.public.web
.SwapEnv$envir <- new.env();
}
if(is.null(.SwapEnv$GaussModel)){
.SwapEnv$GaussModel <- GaussModel;
}
if(missing(param) | is.null(param)) {
param <- SetPeakParam();
}
.optimize_switch <- .SwapEnv$.optimize_switch <- TRUE;
#Build Running plan for optimization - Indentify the controller
if (is.null(running.controller)) {
c1 <- TRUE;
.running.as.plan <- FALSE;
} else {
c1 <- running.controller@others_1[["c1"]];
.running.as.plan <- TRUE;
}
MessageOutput("\nStep 1/6: Start to optimize parameters!
\nThis step may take a long time...", "\n", NULL)
start.time<-Sys.time();
if (c1){
if (missing(param)){
stop("Please provide the param with 'SetPeakParam' function !")
} else if (missing(raw_data)){
stop("Please provide the data of MSnExp format!" )
} else if (missing(method)){
method<-"DoE";
};
MessageOutput(paste0("DoE Optimization Starting Now... (",
Sys.time(),")"), "\n", NULL)
if (.on.public.web()){
ncore <- 2;
} else if (missing(ncore)){
ncore <- ceiling(detectCores()*2/3);
message("'ncore' is absent, will use 2/3 CPU threads of total!")
};
# Define the global Parallel cores for Single Core function mode
if (ncore == 1){
if (.Platform$OS.type=="unix"){
register(bpstart(MulticoreParam(ncore)))
} else {
register(bpstart(SnowParam(ncore)))
};
}
## Optimize the noise and prefilter indexes with AutoTuner
if (param[["Peak_method"]] == "centWave"){
MessageOutput("Evaluating Noise level...", "\n", NULL)
save(raw_data, file = "raw_data_noise.rda")
p2 <- tryCatch(
Noise_evaluate(raw_data),
error = function(e) {
e
}
)
MessageOutput(paste0("ppm is estimated as : ", p2$ppm), "\n")
if (is(p2,"simpleError")) {
print_mes_tmp <-
paste0("\n",
"<font color=\"red\">",
"ERROR:",
p2$message,
"</font>",
"\n")
print_mes_tmp2 <-
paste0(
"<font color=\"orange\">",
"Don't worry: Will use default noise parameters instead!",
"</font>"
)
print_mes <- paste0(print_mes_tmp, print_mes_tmp2)
MessageOutput(print_mes, "\n", NULL)
} else {
MessageOutput("Done!", "\n", NULL)
}
if(is(p2,"simpleError")){
#param[["ppm"]]<-15;
param[["noise"]]<-100;
param[["prefilter"]]<-3;
param[["value_of_prefilter"]]<-10;
} else {
if(!is.nan(p2$ppm) & !is.infinite(p2$ppm)){
p2[["ppm"]]<-round(p2$ppm,2);
} else {
p2[["ppm"]] <- param[["ppm"]];
}
if(!is.nan(p2$noise) & !is.infinite(p2$noise)){
p2[["noise"]]<-round(p2$noise,2);
} else {
p2[["noise"]] <- 100;
}
if(!is.nan(p2$prefilter) &
!is.infinite(p2$prefilter)){
p2[["prefilter"]]<-round(p2$prefilter,2);
} else {
p2[["prefilter"]] <- 3;
}
if(!is.nan(p2$value_of_prefilter) &
!is.infinite(p2$value_of_prefilter)){
p2[["value_of_prefilter"]]<-round(p2$value_of_prefilter,2);
} else {
p2[["value_of_prefilter"]] <- 10;
}
param[["ppm"]]<-round(p2$ppm,2);
param[["noise"]]<-round(p2$noise,2);
param[["prefilter"]]<-round(p2$prefilter,2);
param[["value_of_prefilter"]]<-round(p2$value_of_prefilter,2);
if(round(p2$ppm,2) < 1 | round(p2$ppm,2) > 100){
param[["ppm"]] <- param[["ppm"]]; # Estimation failure, use the default instead !
} else {
param[["ppm"]] <- round(p2$ppm,2);
}
if(round(p2$prefilter,2) < 2){
param[["prefilter"]] <- 3; # Estimation failure, use the default instead !
} else {
param[["prefilter"]] <- round(p2$prefilter,2);
}
if(round(p2$value_of_prefilter,2) < 10){
param[["value_of_prefilter"]] <- 10;
# Estimation failure, use the default instead !
} else {
param[["value_of_prefilter"]] <- round(p2$value_of_prefilter,2);
}
if(round(p2$noise,2) < 0){
param[["noise"]] <- 100; # Estimation failure, use the default instead !
} else {
param[["noise"]] <- round(p2$noise,2);
}
}
MessageOutput(NULL, NULL, 5.00);
}
if (method=="DoE"){
p1 <- optimize.xcms.doe(raw_data,param=param,ncore=ncore);
};
if(.running.as.plan){
cache.save(p1, funpartnm= "others_c1");
marker_record("others_c1");
}
} else {
p1 <- cache.read ("others","c1");
marker_record("others_c1");
}
if (.on.public.web()){
MessageOutput(NULL, NULL, 20.00);
if (is(p1,"simpleError")) {
print_mes <-
paste0("<font color=\"red\">",
"\nERROR:",
p1$message,
"</font>",
"\n")
print_mes_tmp2 <-
paste0(
"<font color=\"orange\">",
"ADVICE: Please follow the methods above to correct
or use the default instead!",
"</font>"
)
print_mes <- paste0(print_mes, print_mes_tmp2)
MessageOutput(print_mes, "\n", NULL)
stop("EXCEPTION POINT CODE: PO2")
}
} else {
end.time<-Sys.time();
message("Time Spent In Total:",round((as.numeric(end.time) -
as.numeric(start.time))/60, 1),"mins");
}
best_params <- p1;
if(.on.public.web()) {
save(best_params, file = "best_params.rda");
return(best_params);
} else {
return(best_params);
}
}
#' @title Overall Funtion for DoE
#' @description This function is the overall function to handle the
#' starting of the optimization process and
#' pre-define the parameters' range according to the input of the
#' parameters for following Design of Experiment (DoE).
#' @param raw_data MSnExp object, The trimmed or original data input
#' for optimization.
#' @param param List, the parameters lists set by 'SetPeakParam' function.
#' The noise, prefilter and ppm values should
#' be defined by AutoTuner in the previous steps.
#' @param ncore Numeric, core number used to perform the parallel
#' based optimization.
#' @noRd
#' @import MSnbase
#' @import progress
#' @import parallel
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
optimize.xcms.doe <- function(raw_data, param, ncore = 8){
#### Parameters Setting for optimization!------
if (is.null(param)){
stop("Please provide the param with 'SetPeakParam' function !")
} else {Parameters<-param};
## Define the range of the parameters for optimization
if (Parameters$Peak_method=="centWave" && Parameters$RT_method=="loess"){
## Keep these Parameters
Parameters$value_of_prefilter <- Parameters$value_of_prefilter;
## Parameters for peak picking
Parameters$max_peakwidth <- c(Parameters$max_peakwidth*0.75,
Parameters$max_peakwidth*1.25);
Parameters$min_peakwidth <- c((Parameters$min_peakwidth)*0.75,
(Parameters$min_peakwidth)*1.25);
#Parameters$ppm <- c(Parameters$ppm*0.5,Parameters$ppm*1.5)
Parameters$mzdiff <- c(-Parameters$mzdiff*1.2,
Parameters$mzdiff*1.2);
Parameters$snthresh<-c(Parameters$snthresh*0.75,
Parameters$snthresh*1.25);
## Parameters for Alignment
Parameters$bw<-c(Parameters$bw*0.5,Parameters$bw*1.5);
} else
if (Parameters$Peak_method=="centWave" &&
Parameters$RT_method=="obiwarp"){
## Keep these Parameters
Parameters$value_of_prefilter <- Parameters$value_of_prefilter;
## Parameters for peak picking
Parameters$max_peakwidth <- c(Parameters$max_peakwidth*0.75,
Parameters$max_peakwidth*1.25)
Parameters$min_peakwidth <- c((Parameters$min_peakwidth)*0.75,
(Parameters$min_peakwidth)*1.25)
#Parameters$ppm <- c(1,Parameters$ppm*2);
Parameters$mzdiff <- c(-Parameters$mzdiff*2,
Parameters$mzdiff*2);
Parameters$snthresh<-c(Parameters$snthresh*0.75,
Parameters$snthresh*1.25)
## Parameters for Alignment
Parameters$bw<-c(Parameters$bw*0.5,Parameters$bw*1.5)
} else
if (Parameters$Peak_method=="matchedFilter" &&
Parameters$RT_method=="loess"){
## Parameters for peak picking
Parameters$fwhm <- c((Parameters$fwhm)*0.8,
(Parameters$fwhm)*1.2)
Parameters$sigma <- c(Parameters$sigma*0.8,
Parameters$sigma*1.2)
Parameters$steps <- c(Parameters$steps-1,
Parameters$steps+1)
Parameters$mzdiff <- c(-Parameters$mzdiff*2,
Parameters$mzdiff*2)
Parameters$snthresh<-c(1,Parameters$snthresh*10)
## Parameters for Alignment
Parameters$bw<-c(Parameters$bw*0.5,
Parameters$bw*1.5)
} else
if (Parameters$Peak_method=="matchedFilter" &&
Parameters$RT_method=="obiwarp"){
## Parameters for peak picking
Parameters$fwhm <- c((Parameters$fwhm)*0.8,
(Parameters$fwhm)*1.2)
Parameters$sigma <- c(Parameters$sigma*0.8,
Parameters$sigma*1.2)
Parameters$steps <- c(Parameters$steps-1,
Parameters$steps+1)
Parameters$mzdiff <- c(-Parameters$mzdiff*2,
Parameters$mzdiff*2)
Parameters$snthresh<-c(1,Parameters$snthresh*10)
## Parameters for Alignment
Parameters$bw<-c(Parameters$bw*0.5,
Parameters$bw*1.5)
} else {
stop("There must be something wrong about the
Peak_method value in your primary params set !")
};
MessageOutput("Preparing Parameters for optimization finished !", "\n", NULL);
if (.Platform$OS.type=="unix"){
bp <- MulticoreParam(ncore);
} else {
bp <- SnowParam(ncore);
};
#### Start to Optimize !
result <- optimizxcms.doe.peakpicking(object = raw_data,
params = Parameters,
BPPARAM = bp,
nSlaves = ncore,
subdir = NULL,
plot = FALSE);
optimizedxcmsObject <- result$best_settings$xset;
##Parameters Out-put
if (length(result) > 0) {
if (!is.null(result[["best_settings"]][["parameters"]])) {
best_parameters <- result[["best_settings"]][["parameters"]]
} else{
return(param)
}
} else{
return(param)
}
MessageOutput(paste0("Step 1/6: Parameters Optimization Finished ! (",
Sys.time(),")"), "\n", NULL);
return(best_parameters)
}
#' @title Core Optimization Function of DoE
#' @description This function is the core for parameters optimization
#' with Design of Experiment (DoE)
#' method.
#' @param object MSnExp object, the trimmed or the original data.
#' @param params List, the parameters lists set by 'SetPeakParam' function.
#' The noise, prefilter and ppm values should
#' be defined by AutoTuner in the previous steps.
#' @param nSlaves Numeric, core number used to perform the parallel based optimization.
#' @param BPPARAM MulticoreParam method, used to set the parallel method.
#' Default is bpparam().
#' @param plot Logical, weather to plot the Contours plots of the DoE results.
#' @param ... Other parameters.
#' @noRd
#' @import MSnbase
#' @import progress
#' @import parallel
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
optimizxcms.doe.peakpicking <- function(object = NULL,
params = params,
BPPARAM = bpparam(),
nSlaves = 4,
plot = FALSE,...) {
isotopeIdentification = c("IPO");
centWave <- is.null(params$fwhm) ;
history <- list();
iterator = 1;
best_range <- 0.25;
object_mslevel<-PeakPicking_prep(object);
while(iterator < 20) {#Forcely stop to ensure the reasonability!
MessageOutput(paste0("Round:",iterator, "\nDoE Running Begin..."),
"\n", NULL)
mSet_OPT <-
ExperimentsCluster_doe(
object = object,
object_mslevel=object_mslevel,
params = params,
isotopeIdentification = isotopeIdentification,
BPPARAM = BPPARAM,
nSlaves = nSlaves,
iterator = iterator
)
MessageOutput(paste0("100% |"), "\n", NULL);
### Normalize the PPS and CV in mSet_OPT
PPS.set<-as.numeric(sapply(seq_len(nrow(mSet_OPT[["response"]])),
FUN=function(x){
mSet_OPT[["response"]][x,5]
}));
CV.set<-as.numeric(sapply(seq_len(nrow(mSet_OPT[["response"]])),
FUN=function(x){
mSet_OPT[["response"]][x,6]
}))
RCS.set<-as.numeric(sapply(seq_len(nrow(mSet_OPT[["response"]])),
FUN=function(x){
mSet_OPT[["response"]][x,7]
}))
GS.set<-as.numeric(sapply(seq_len(nrow(mSet_OPT[["response"]])),
FUN=function(x){
mSet_OPT[["response"]][x,8]
}))
GaussianSI.set<-as.numeric(sapply(seq_len(nrow(mSet_OPT[["response"]])),
FUN=function(x){
mSet_OPT[["response"]][x,9]
}))
index.set<-list(CV=CV.set,
RCS=RCS.set,
GS=GS.set,
GaussianSI=GaussianSI.set)
# Normalize the index
CV.set.normalized<-(CV.set-min(CV.set))/(max(CV.set)-min(CV.set))
RCS.set.normalized<-(RCS.set-min(RCS.set))/(max(RCS.set)-min(RCS.set))
GS.set.normalized<-(GS.set-min(GS.set))/(max(GS.set)-min(GS.set))
GaussianSI.set.normalized<-GaussianSI.set
# Calculate QCoE
QCoE<-CV.set.normalized*0.2+RCS.set.normalized*0.4+
GS.set.normalized*0.4
# Calculate QS
QS<-PPS.set*QCoE*GaussianSI.set.normalized;
tmp_matrix<-mSet_OPT[["response"]];
tmp_matrix<-cbind(tmp_matrix,PPS.set,CV.set.normalized,
RCS.set.normalized,GS.set.normalized,
GaussianSI.set.normalized,QCoE,QS)
mSet_OPT[["response"]]<-tmp_matrix;
MessageOutput(paste0("Round ",iterator," Finished !"), "\n", NULL)
mSet_OPT <-
tryCatch(Statistic_doe(
object = object,
object_mslevel=object_mslevel,
isotopeIdentification = isotopeIdentification,
BPPARAM = BPPARAM,
subdir = NULL,
plot = FALSE,
mSet_OPT = mSet_OPT,
iterator = iterator,
index.set = index.set,
useNoise = params[["noise"]]
), error = function(e) e)
if(is(mSet_OPT,"simpleError")){
MessageOutput("Optimization Failed: Too few peaks in your data !
Will use default parameters for processing!","\n")
break;
}
history[[iterator]] <- mSet_OPT
params <- mSet_OPT$params
increaRes <- resultIncreased_doe(history);
if(!is.null(increaRes)){
if(!increaRes) {
MessageOutput("No Increase Stopping !")
maxima <- 0
max_index <- 1
for(i in seq_along(history)) {
## TODO: need to think more on this discrimination criteria: based on max setting or max QS?
if(history[[i]]$max_settings[1] > maxima) {
maxima <- history[[i]]$max_settings[1]
max_index <- i
}
}
xcms_parameters <- mSet_OPT$OptiParams;
# as.list(decodeAll(history[[max_index]]$max_settings[-1],
# history[[max_index]]$params$to_optimize))
#
# xcms_parameters <- combineParams(xcms_parameters,
# params$no_optimization)
#
# if(!is.list(xcms_parameters))
# xcms_parameters <- as.list(xcms_parameters)
#
# # deal with the too narrow peak width issue
# pkmin <- xcms_parameters$min_peakwidth;
# pkmax <- xcms_parameters$max_peakwidth;
#
# if(abs(pkmax - pkmin) < 5 & pkmin > 5){
# xcms_parameters$max_peakwidth <- pkmax + 2.5;
# xcms_parameters$min_peakwidth <- pkmin - 2.5;
# } else if (abs(pkmax - pkmin) < 5 & pkmin < 5) {
# xcms_parameters$max_peakwidth <- pkmax + 5;
# }
best_settings <- list()
best_settings$parameters <- xcms_parameters
#best_settings$xset <- history[[max_index]]$xset
#target_value <- history[[max_index]]$QS
#best_settings$result <- target_value
history$best_settings <- best_settings
if(!.on.public.web()){
message("best parameter settings:")
cat(paste(rbind(paste(names(xcms_parameters), sep="", ": "),
paste(xcms_parameters, sep="", "\n")), sep=""))
}
return(history)
}
} else {
MessageOutput("Optimization Failed: Too few peaks in your data ! Will use default parameters for processing!","\n")
break;
}
for(i in seq_along(params$to_optimize)) {
parameter_setting <- mSet_OPT$max_settings[i+1]
bounds <- params$to_optimize[[i]]
fact <- names(params$to_optimize)[i]
min_factor <-
ifelse(fact=="min_peakwidth", 3,
ifelse(fact=="mzdiff",
ifelse(centWave,-0.02, 0.001),
ifelse(fact=="step",0.0005,
ifelse(fact=="bw",2,
ifelse(fact=="snthresh",5,1)))))
step_factor <-
ifelse(is.na(parameter_setting), 1.2,
ifelse((abs(parameter_setting) < best_range), 0.8,
ifelse(parameter_setting==-1 &
decode(-1, params$to_optimize[[i]]) ==
min_factor, 0.8, 1)))
step <- (diff(bounds) / 2) * step_factor
if(is.na(parameter_setting))
parameter_setting <- 0
new_center <- decode(parameter_setting, bounds)
if((new_center-min_factor) > step) {
new_bounds <- c(new_center - step, new_center + step)
} else {
new_bounds <- c(min_factor, 2*step+min_factor)
}
names(new_bounds) <- NULL
if(names(params$to_optimize)[i] == "steps" |
names(params$to_optimize)[i] == "prefilter") {
params$to_optimize[[i]] <- round(new_bounds, 0)
} else {
params$to_optimize[[i]] <- new_bounds
}
}
if(centWave) {
if(!is.null(params$to_optimize$min_peakwidth) |
!is.null(params$to_optimize$max_peakwidth)) {
pw_min <-
ifelse(is.null(params$to_optimize$min_peakwidth),
params$no_optimization$min_peakwidth,
max(params$to_optimize$min_peakwidth))
pw_max <-
ifelse(is.null(params$to_optimize$max_peakwidth),
params$no_optimization$max_peakwidth,
min(params$to_optimize$max_peakwidth))
if(pw_min >= pw_max) {
additional <- abs(pw_min-pw_max) + 1
if(!is.null(params$to_optimize$max_peakwidth)) {
params$to_optimize$max_peakwidth <-
params$to_optimize$max_peakwidth + additional
} else {
params$no_optimization$max_peakwidth <-
params$no_optimization$max_peakwidth + additional
}
}
}
}
params <- attachList(params$to_optimize, params$no_optimization)
iterator <- iterator + 1
}
MessageOutput(NULL, NULL, 19);
params <- attachList(params$to_optimize, params$no_optimization)
return(history)
}
#' @title Experiment Functions of DoE
#' @description This function is used to perform the test with Design of Experiment (DoE) on the parameters dataset.
#' @param object MSnExp object, the trimmed or the original data.
#' @param object_mslevel List, the parsed metabolomics scans produced by PeakPicking_prep.
#' @param params parameters set for optimization
#' @param isotopeIdentification Character, IsotopeIdentidication method, usually includes 'IPO' and 'CAMERA'.
#' @param BPPARAM MulticoreParam method, used to set the parallel method. Default is bpparam().
#' @param nSlaves Numeric, core number used to perform the parallel based optimization.
#' @param iterator round of DoE
#' @noRd
#' @import MSnbase
#' @importFrom rsm decode.data ccd rsm
#' @import progress
#' @import parallel
#' @importFrom parallel clusterExport
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
ExperimentsCluster_doe <-function(object, object_mslevel,params,
isotopeIdentification, BPPARAM = bpparam(),nSlaves=4, iterator) {
# To form a parameters combination table
typ_params <- typeCastParams(params)
if(length(typ_params$to_optimize)>1) {
design <- getCcdParameter(typ_params$to_optimize);
param_design <- rsm::decode.data(design);
} else {
design <- data.frame(run.order=seq_len(9), a=seq(-1,1,0.25))
colnames(design)[2] <- names(typ_params$to_optimize)
param_design <- design
param_design[,2] <-
seq(min(typ_params$to_optimize[[1]]),
max(typ_params$to_optimize[[1]]),
diff(typ_params$to_optimize[[1]])/8)
}
param_design <- combineParams(param_design, typ_params$no_optimization)
design_list <- apply(param_design,1,FUN = function(x){
as.list(x)
})
tasks <- seq_len(nrow(design));
if (.Platform$OS.type=="windows"){
MessageOutput("Your OS is Windows, there might be unexpected errors.\n")
MessageOutput("If there is some unexpected bugs, please reduce the 'core' as 1.\n")
}
if (.on.public.web()){
nSlaves <- 2;
}
# Parallel or single core
if(nSlaves > 1) {
if (.on.public.web()){
nstepby <- 5;
nstep<-ceiling(length(tasks)/nstepby);
} else {
# Memory optimization strategy
ncount <- object@phenoData@data[["sample_name"]];
data.size <- round(as.numeric(object.size(object) / 1024 / 1024), 1)
if (.Platform$OS.type == "unix") {
memtotal <-
ceiling(as.numeric(
system("awk '/MemTotal/ {print $2}' /proc/meminfo", intern = TRUE)
) / 1024 / 1024)
}
if (.Platform$OS.type == "windows") {
memtotal <-
ceiling(as.numeric(gsub(
"\r", "", gsub(
"TotalVisibleMemorySize=",
"",
system('wmic OS get TotalVisibleMemorySize /Value', intern = TRUE)[3]
)
)) / 1024 / 1024)
}
if (data.size < 1) {
data.size <- 0.5
}
if (memtotal / data.size > 30) {
nstepby <- ceiling(memtotal * 1.5 / (data.size * 32))
} else if (memtotal / data.size < 30 &&
memtotal / data.size > 15) {
nstepby <- ceiling(memtotal * 1 / (data.size * 32))
} else {
nstepby <- ceiling(memtotal * 0.5 / (data.size * 32))
}
nstep <- ceiling(length(tasks) / nstepby)
}
## Parallel Setting
#if('snow' %in% rownames(installed.packages())){
# unloadNamespace("snow")
#}
cl_type <- getClusterType()
cl <- parallel::makeCluster(nSlaves,type = cl_type)
response <- matrix(0, nrow=length(design[[1]]), ncol=9)
parallel::clusterExport(cl, .optimize_function_list, envir = asNamespace("OptiLCMS"))
# Setting progress bar and start the running loop
pb <- progress_bar$new(format = "DoE Running [:bar] :percent Time left: :eta", total = nstep, clear = TRUE, width= 75)
if (.on.public.web()){
print_mes <- paste0("Finished: ");
write.table(print_mes,file="metaboanalyst_spec_proc.txt",append = TRUE ,row.names = FALSE,col.names = FALSE, quote = FALSE, eol = "");
}
for (w in seq_len(nstep)){
if (.on.public.web()){
count_tmp <- pb$tick();
totalcount <- formals(count_tmp[["initialize"]])[["total"]];
currentcount <- environment(count_tmp[["tick"]])[["private"]][["current"]];
write.table((w/nstep*(iterator/4)*3.5+(iterator-1)*3.5+5), file = "log_progress.txt", row.names = FALSE,col.names = FALSE);
print_mes <- paste(round(w/(nstep+1), digits=2)*100, "%");
write.table(print_mes,file="metaboanalyst_spec_proc.txt",append = TRUE,row.names = FALSE,col.names = FALSE, quote = FALSE, eol = " | ");
} else {
pb$tick();
}
value.index<-tasks[c(seq_len(nstepby))+(w-1)*nstepby];
if (NA %in% tasks[c(seq_len(nstepby))+(w-1)*nstepby]){
value.index<-value.index[-which(is.na(tasks[c(seq_len(nstepby))+(w-1)*nstepby]))]
}
response0 <- parallel::parSapply(
cl = cl,
X = value.index,
FUN = SlaveCluster_doe,
Set_parameters = design_list,
object = object,
object_mslevel=object_mslevel,
isotopeIdentification = isotopeIdentification,
BPPARAM = BPPARAM,
USE.NAMES = FALSE,
singleCore = FALSE
)
response[value.index,]<-t(response0)
}
parallel::stopCluster(cl)
} else {
message("Processing .",appendLF = FALSE)
response <-
sapply(
X = tasks,
FUN = SlaveCluster_doe,
Set_parameters = design_list,
object = object,
object_mslevel=object_mslevel,
isotopeIdentification = isotopeIdentification,
BPPARAM = BPPARAM,
singleCore = TRUE
)
response <- t(response)
message(" OK, ",appendLF = FALSE)
}
colnames(response) <- c("exp", "num_peaks", "notLLOQP", "num_C13", "PPS","CV","RCS","GS","GaussianSI")
response <- response[order(response[,1]),]
ret <- list()
ret$params <- typ_params
ret$design <- design
ret$response <- response
return(ret)
}
#' @title Analyze DoE Result
#' @description Analyze Design of Experiment (DoE) Result
#' @param object MSnExp object, the trimmed or the original data.
#' @param object_mslevel List, the parsed metabolomics scans produced by PeakPicking_prep.
#' @param isotopeIdentification Character, IsotopeIdentidication method, usually includes 'IPO' and 'CAMERA'.
#' @param BPPARAM MulticoreParam method, used to set the parallel method. Default is bpparam().
#' @param mSet_OPT List, the result produced by 'ExperimentsCluster'.
#' @param subdir Logical, weather to creat a sub-directory (if true) or not (if false).
#' @param plot Logical, weather to plot the Contours plots of the DoE results.
#' @param iterator Numeric, the round number of the DoE.
#' @param index.set List, the indexes set (including PPS, CV, RCS, GS and Gaussian Index) produced by
#' ExperiemntCluster.
#' @param useNoise Numeric, the noise level removed to evalute the gaussian peak.
#' @noRd
#' @import MSnbase
#' @import parallel
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
Statistic_doe <-function(object, object_mslevel, isotopeIdentification,
BPPARAM = bpparam(), mSet_OPT, subdir = NULL ,plot = FALSE,iterator,
index.set,useNoise) {
MessageOutput(paste0("Model Parsing..."), "\n", NULL)
# Prepare parameters for model prediction
params <- mSet_OPT$params
resp <- mSet_OPT$response[, "QS"]
# Creat the prediction model and get the predicted best results
model <- createModel(mSet_OPT$design, params$to_optimize, resp)
mSet_OPT$model <- model
max_settings <- getMaximumLevels(mSet_OPT$model)
tmp <- max_settings[1,-1] # first row without response
tmp[is.na(tmp)] <- 1 # if Na (i.e. -1, and 1), show +1
mSet_OPT$max_settings <- max_settings
xcms_parameters <-
as.list(decodeAll(max_settings[-1], params$to_optimize))
xcms_parameters <-
combineParams(xcms_parameters, params$no_optimization)
if(!is.list(xcms_parameters))
xcms_parameters <- as.list(xcms_parameters)
# deal with the too narrow peak width issue
pkmin <- xcms_parameters$min_peakwidth;
pkmax <- xcms_parameters$max_peakwidth;
if(abs(pkmax - pkmin) < 7.5 & pkmin > 7.5){ # TO ENSURE THE PEAK WIDTH IS OVER 7.5
xcms_parameters$max_peakwidth <- pkmax + 3.75;
xcms_parameters$min_peakwidth <- pkmin - 3.75;
} else if (abs(pkmax - pkmin) < 7.5 & pkmin < 7.5) {
xcms_parameters$max_peakwidth <- pkmax + 7.5;
}
# Detect the peak features with the predicted best parameters
mSet <- suppressMessages(calculateSet_doe(object = object, object_mslevel=object_mslevel,
Set_parameters = xcms_parameters,
task = 1, BPPARAM = BPPARAM));
if(!is(mSet, "mSet")){ # All params failed - avoid this corner case
Eset <- list()
Eset$RCS <- Eset$CV <- Eset$PPS <- 0;
mSet_OPT$QS <- 0;
mSet_OPT$PPS <- Eset;
mSet_OPT$OptiParams <- xcms_parameters;
if (.on.public.web()){
print_mes <- paste0("Model Parsing Done !\n");
write.table(print_mes,file="metaboanalyst_spec_proc.txt",append = TRUE, row.names = FALSE,col.names = FALSE, quote = FALSE, eol = "\n");
} else {
message("Model Parsing Done !\n")
}
return(mSet_OPT)
}
# xset <- mSet[["xcmsSet"]]
# Calculate the various indexes
#mSet_OPT$xset <- xset
mSet_OPT$PPS <- calcPPS2(mSet, isotopeIdentification)
suppressWarnings(mSet_OPT$PPS$CV <-
suppressMessages(calcCV(mSet)))
mSet_OPT$PPS$RCS <- suppressMessages(calcRCS_GSValues(mSet)$RCS)
mSet_OPT$PPS$GS <- suppressMessages(calcRCS_GSValues(mSet)$GS)
mSet_OPT$PPS$GaussianSI <-
calcGaussianS(mSet, object, useNoise = useNoise)
MessageOutput(paste0("Gaussian peak ratio (%): ", round(mSet_OPT$PPS$GaussianSI,3)*100, ""));
#MessageOutput(paste0("Total peak number is: ", as.numeric(mSet_OPT$PPS[5])));
#save(mSet_OPT, file = paste0("mSet_",Sys.time(),"_780.rda"));
## Normalize the CV, RCS, GS, GaussianSI
normalized.CV<-(mSet_OPT$PPS$CV-min(index.set$CV))/(max(index.set$CV)-min(index.set$CV));
normalized.RCS<-(mSet_OPT$PPS$RCS-min(index.set$RCS))/(max(index.set$RCS)-min(index.set$RCS));
normalized.GS<-(mSet_OPT$PPS$GS-min(index.set$GS))/(max(index.set$GS)-min(index.set$GS));
normalized.GaussianSI<-mSet_OPT$PPS$GaussianSI;
## Calculate the QS for the best combination in current iterator!
QCoE<-0.2*(normalized.CV)+0.4*(normalized.GS+normalized.RCS);
mSet_OPT$QS<-as.numeric(mSet_OPT$PPS[5])*QCoE*normalized.GaussianSI^1.5;
mSet_OPT$OptiParams <- xcms_parameters;
MessageOutput(paste0("Model Parsing Done !\n"), "\n", NULL)
return(mSet_OPT)
}
#' @title Core Peak Picking Slave Cluster
#' @description Core Peak Picking Slave Cluster
#' @param task Numeric, task order for XCMS paramters table to run the peak picking and alignment.
#' @param Set_parameters Matrix, the parameters combination produced automatically according to
#' the primary parameters input.
#' @param object object, the trimmed or the original data.
#' @param object_mslevel List, the parsed metabolomics scans produced by PeakPicking_prep.
#' @param isotopeIdentification Character, IsotopeIdentidication method, usually includes 'IPO' and 'CAMERA'.
#' @param BPPARAM MulticoreParam method, used to set the parallel method. Default is bpparam().
#' @noRd
#' @import MSnbase
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
SlaveCluster_doe <-function(task, Set_parameters, object, object_mslevel,
isotopeIdentification, BPPARAM = bpparam(), singleCore = FALSE) {
mSet <-
calculateSet_doe(
object = object,
object_mslevel=object_mslevel,
Set_parameters = Set_parameters,
task = task,
BPPARAM = BPPARAM
)
if(singleCore){
#MessageOutput(paste("Finished", task,"/",length(Set_parameters),"in this round !"), SuppressWeb = TRUE)
message(".",appendLF = FALSE)
}
if (!is(mSet, "character")){
#MessageOutput("Peak Feature Analyzing...", SuppressWeb = TRUE)
#xset <- mSet[["xcmsSet"]]
result <- calcPPS2(mSet, isotopeIdentification)
result[1] <- task
tmp_cv<- try(suppressMessages(calcCV(mSet)),silent = TRUE);
if (is(tmp_cv, "try-error")){
result[6]<-0;
} else {
result[6] <- tmp_cv;
}
tmp_RCS_GS <- try(suppressMessages(calcRCS_GSValues(mSet)),silent = TRUE);
if (is(tmp_RCS_GS,"try-error")){
result[8] <- result[7] <- 0;
} else {
result[7] <- tmp_RCS_GS$RCS;
result[8] <- tmp_RCS_GS$GS;
};
tmp_GaussianSI <- try(calcGaussianS(mSet, object,
useNoise = as.numeric(Set_parameters[[task]]$noise)),
silent = TRUE);
if (is(tmp_GaussianSI,"try-error")){
result[9]<-0;
} else {
result[9]<-tmp_GaussianSI
};
names(result)[c(6,7,8,9)]<-c("CV","RCS","GS","GaussianSI")
#result
#MessageOutput("Peak Feature Analyzing Done !\n", SuppressWeb = TRUE)
} else{
result<-c(task,0,0,0,0,0,0,0,0)
}
return(result)
}
#' @title Cluster of Peak Picking and Alignment
#' @description Cluster of Peak Picking and Alignment
#' @param object MSnExp object, the trimmed or the original data.
#' @param object_mslevel List, the parsed metabolomics scans produced by PeakPicking_prep.
#' @param Set_parameters Matrix, the parameters combination produced automatically according to
#' the primary parameters input.
#' @param task Numeric, task order for XCMS paramters table to run the peak picking and alignment.
#' @param BPPARAM MulticoreParam method, used to set the parallel method. Default is bpparam().
#' @noRd
#' @import MSnbase
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
calculateSet_doe <- function(object, object_mslevel, Set_parameters, task = 1,
BPPARAM = bpparam()) {
if (length(Set_parameters) < 33){ # deal with the usual processing case
param <- updateRawSpectraParam(Set_parameters)
} else { # deal with the optimization case, usaully 44 (33 each set)
param <- updateRawSpectraParam(Set_parameters[[task]])
}
mSet <- calculatePPKs(object, object_mslevel, param, BPPARAM = bpparam())
mSet <- calculateGPRT(mSet, param)
return(mSet)
}
#' @title Peak picking Method
#' @description Wrapped Peak picking Method
#' @param object MSnExp object, the trimmed or the original data.
#' @param object_mslevel List, the parsed metabolomics scans produced by PeakPicking_prep.
#' @param param Matrix, the parameters combination produced automatically according to
#' @param BPPARAM MulticoreParam method, used to set the parallel method. Default is bpparam().
#' @param msLevel Numeric, to specifiy the msLevel, only 1 permitted for now. 2 will be supported
#' in the near future.
#' @import MSnbase
#' @noRd
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
calculatePPKs<-function(object, object_mslevel,param,
BPPARAM = bpparam(),msLevel = 1){
if (param$Peak_method == "centWave" | param$Peak_method == "matchedFilter") {
mSet <- try(PeakPicking_core(object, object_mslevel,
param = param,
msLevel = 1),
silent = TRUE)
} else {
stop("Other peak picking method cannot be supported for now !")
}
}
#' @title Alignment Method
#' @description Wrapped Peak Alignment Method
#' @param mSet mSet object, the data produced by 'calculatePPKs' function.
#' @param param Matrix, the parameters combination
#' produced automatically according to
#' @noRd
#' @import MSnbase
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
calculateGPRT<-function (mSet,param){
mSetTmp <- new("mSet");
mSetTmp@params <- param;
mSetTmp@params$BlankSub <- FALSE;
if(!is(mSet, "try-error")){
mSetTmp@peakpicking <- mSet$msFeatureData;
mSetTmp@rawInMemory <- mSet$inMemoryData;
}
mSetTmp <- try(PerformPeakAlignment(mSetTmp),silent = TRUE);
gc();
mSetTmp <- try(PerformPeakFiling (mSetTmp),silent = TRUE);
gc();
if (is(mSetTmp,"try-error")) {
mSet<-"Xset_NA";
} else {
mSet <- mSetTmp;
}
return(mSet)
}
#' @title Calculate PPS method
#' @description Calculate PPS method
#' @param mSet xcmsSet Object, this object is produced by 'calculateSet_doe'
#' function, and transformed
#' with as(objec,'xcmsSet') function.
#' @param isotopeIdentification Character, IsotopeIdentidication method,
#' usually includes 'IPO' and 'CAMERA'.
#' @noRd
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
calcPPS2 <- function(mSet, isotopeIdentification=c("IPO", "CAMERA")) {
isotopeIdentification <- match.arg(isotopeIdentification);
ret <- vector(mode="numeric", 5); #array(0, dim=c(1,5))
names(ret) <- c("ExpId", "#peaks", "#NonRP", "#RP", "PPS");
if(is.null(mSet)) {
return(ret)
}
if(nrow(peaks_IPO(mSet)) == 0) {
return(ret)
}
peak_source <- peaks_IPO(mSet)[,c("mz", "rt", "sample", "into", "mzmin",
"mzmax", "rtmin", "rtmax"),drop=FALSE]
ret[2] <- nrow(peak_source)
if(isotopeIdentification == "IPO")
iso_mat <- findIsotopes.IPO(mSet)
else
iso_mat <- findIsotopes.CAMERA(mSet)
samples <- unique(peak_source[,"sample"]);
isotope_abundance = 0.01108;
#calculating low intensity peaks
for(sample in samples) {
non_isos_peaks <- peak_source
if(nrow(iso_mat) > 0) {
non_isos_peaks <- peak_source[-unique(c(iso_mat)),,drop=FALSE]
}
speaks <- non_isos_peaks[non_isos_peaks[,"sample"]==sample,,drop=FALSE]
intensities <- speaks[,"into"]
na_int <- is.na(intensities)
intensities <- intensities[!na_int]
if(length(intensities)>0) {
tmp <- intensities[order(intensities)]
int_cutoff <- mean(tmp[seq_len(max(round((length(tmp)/33),0),1))])
masses <- speaks[!na_int, "mz"]
#floor((masses-2*CH3)/CH2) + 2
maximum_carbon <- calcMaximumCarbon(masses)
carbon_probabilty <- maximum_carbon*isotope_abundance
iso_int <- intensities * carbon_probabilty
not_loq_peaks <- sum(iso_int>int_cutoff)
ret[3] <- ret[3] + not_loq_peaks
}
}#end_for_sample
ret[4] <- length(unique(c(iso_mat)))
if(ret[3] == 0) {
ret[5] <- (ret[4]+1)^1.5/(ret[3]+1)
} else {
ret[5] <- ret[4]^1.5/ret[3]
}
return(ret)
}
#' @title Calculatre CV method
#' @description Calculatre CV method
#' @param mSet mSet Object, this object is produced by 'calculateSet_doe' function.
#' @noRd
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
calcCV<-function(mSet){
ncount<-length(mSet@rawInMemory@phenoData[["sample_name"]])
table.data<-creatPeakTable(mSet);
table.data$abundance.mean <- apply(table.data[, 9:(8 + ncount)],1,
FUN = mean, na.rm = TRUE);
table.data$abundance.sd <- apply(table.data[, 9:(8 + ncount)],1,
FUN = sd, na.rm = TRUE);
table.data$abundance.cv <-
(table.data$abundance.sd * 100)/table.data$abundance.mean;
cv.min<-min(table.data$abundance.cv, na.rm = TRUE);
cv.0.25<-as.numeric(quantile(table.data$abundance.cv, probs = 0.25, na.rm = TRUE));
cv.med<-median(table.data$abundance.cv, na.rm = TRUE);
cv.0.75<-as.numeric(quantile(table.data$abundance.cv, probs = 0.75, na.rm = TRUE));
cv.max<-max(table.data$abundance.cv, na.rm = TRUE);
cv.score<-1/(0.75*cv.med+0.25*(cv.max-cv.med))
return(cv.score)
}
#' @title Calculatre RCS and GS method
#' @description Calculatre RCS and GS method
#' @param mSet mSet Object, this object is produced by 'calculateSet_doe' function.
#' @noRd
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
calcRCS_GSValues<-function(mSet){
score.ret<-getRGTVValues(mSet)
return(list(GS=score.ret[["GS"]],RCS=score.ret[["RCS"]]))
}
#' @title Calculatre Gaussian Peak Ratio method
#' @description Calculatre Gaussian Peak Ratio method
#' @param mSet OptiLCMS Object, this object is produced by 'calculateSet_doe' function.
#' @param object MSnExp object, the trimmed or the original data
#' (Generated by ImportRawMSData function with "inMemory" mode).
#' @param useNoise Numeric, the noise level removed to evalute the gaussian peak.
#' @param BPPARAM MulticoreParam method, used to set the parallel method. Default is bpparam().
#' @noRd
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
calcGaussianS <-function(mSet, object, useNoise, BPPARAM = bpparam()){
if (identical(useNoise, numeric(0))) {
useNoise <- 0
}
peakmat <- mSet@peakfilling$msFeatureData$chromPeaks;
peakmat_set <- split.data.frame(peakmat, peakmat[, "sample"])
object <- mSet@rawInMemory;
## Protect the environment from overwrite
newAssayEnvir <- new.env();
copyEnv(mSet@rawInMemory@assayData, newAssayEnvir);
object@assayData <- newAssayEnvir;
## Adjusted RT application
scan_names <- sort(names(object@assayData));
adjRTs <- unname(unlist(mSet@peakfilling[["msFeatureData"]][["adjustedRT"]]))
for(s in seq_along(object)){
scanNM <- scan_names[s];
object@assayData[[scanNM]]@rt <- adjRTs[s];
}
## select different platforms
if (.Platform$OS.type == "unix") {
BPPARAM = MulticoreParam(2L);
res <- bplapply(peakmat_set, extFUN,
object = object,
useNoise = useNoise,
BPPARAM = BPPARAM)
# res <- lapply(peakmat_set, extFUN, object = object,
# useNoise = useNoise)
# print(res)
}
if (.Platform$OS.type == "windows") {
res <- lapply(peakmat_set, extFUN, object = object,
useNoise = useNoise)
res <- unlist(res)
}
rm(object);
rm(newAssayEnvir);
gc();
return(mean(sapply(res, FUN = function(x) {
x
})))
}
#' @importFrom stats nls
SSgaussStats <- function(ints){
startValue <- try(nls.start(y ~ GaussModel(x, mu, sigma, h),
data.frame(x = seq_along(ints), y = ints)),
silent = TRUE);
fit <- try(nls(y ~ GaussModel(x, mu, sigma, h),
data.frame(x = seq_along(ints), y = ints),
start = startValue),
silent = TRUE)
return(fit)
}
#' @importFrom stats fitted cor.test
extFUN <- function(z, object, useNoise) {
if (nrow(z) > 150) {
z <- extractPeakForGaussian(z)
# z <- z[sort(sample(seq_len(nrow(z)), 150)), ]
}
currentSample <- suppressMessages(
MSnbase::filterRt(MSnbase::filterFile(object,
z[1, "sample"]),
rt = range(z[, c("rtmin", "rtmax")])))
corr <- unlist(sapply(seq_len(nrow(z)), FUN = function(i) {
corr <- 0.1;
mzRange <- z[i, c("mzmin", "mzmax")] + c(-0.001, 0.001);
rtRange <- z[i, c("rtmin", "rtmax")];
ints <- try(suppressWarnings(MSnbase::intensity(
MSnbase::filterMz(MSnbase::filterRt(currentSample,
rtRange),
mzRange))), silent = TRUE)
if(is(ints, "try-error")){
return(corr);
}
ints[lengths(ints) == 0] <- 0
ints <- as.integer(unlist(ints))
ints <- ints[!is.na(ints)]
ints <- ints[ints > useNoise]
if (length(ints)) {
ints <- ints - min(ints)
# if (max(ints) > 0)
# ints <- ints/max(ints)
fit <- SSgaussStats(ints);
if (is(fit, "try-error")) { # Still error - record error !
#write.table(paste0(fit[[1]],Sys.time()), file = "error_report.txt",
#append = TRUE,eol = "\n");
corr <- 0.1;
} else {
if (sum(!is.na(ints - fitted(fit))) > 4 &&
sum(!is.na(unique(ints))) > 4 && sum(!is.na(unique(fitted(fit)))) >
4) {
cor <- NULL;
old <- options();
on.exit(options(old));
options(show.error.messages = FALSE);
cor <- try(cor.test(ints, fitted(fit),
method = "pearson", use = "complete"));
if (!is.null(cor) && cor$p.value <= 0.05) {
corr <- cor$estimate
}
else if (!is.null(cor) && cor$p.value >
0.05) {
corr <- cor$estimate * 0.85 # Give a penalty on that!
}
}
else {
corr <- 0.1
}
}
}
return(corr)
}))
gaussian.peak.ratio <- nrow(z[corr >= 0.9, , drop = FALSE])/nrow(z)
return(gaussian.peak.ratio)
}
extractPeakForGaussian <- function(z){
oz <- z[,"into"][order(z[,"into"])];
znum <- nrow(z);
ozs <- seq(from = 1,
to = znum,
by = ceiling(znum/150))
return(z[sort(names(oz[ozs])),])
}
#' @title Identify whether results improved or not
#' @description Identify whether results improved or not
#' @param history List, an interal media objects used to save the optimization results of peaks.
#' @noRd
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
resultIncreased_doe <- function(history) {
index = length(history)
if(history[[index]]$PPS$PPS == 0 & index == 1){
MessageOutput(paste("Error: No isotopes detected in your
Intensive ROI of your data, Please manually customize the parameter!"), "\n")
return(NULL)
}
if(index < 2)
return(TRUE)
if(history[[index-1]]$QS >= history[[index]]$QS)
return(FALSE)
return(TRUE)
}
#' @title Noise_evaluation based on Kernal density model
#' @description This functions handles the evaluation on the data noise (noise and prefilter parameters)
#' and the identification on the molecule weights deviation evaluation.
#' @param raw_data MSnExp object, the (trimmed) data in memory produced by 'PerformDataTrimming'.
#' @import MSnbase
#' @import progress
#' @importFrom stats sd
#' @importFrom stats weighted.mean
#' @noRd
#' @references McLean C (2020). Autotuner: Automated parameter
#' selection for untargeted metabolomics data processing
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
Noise_evaluate <- function (raw_data) {
mSet <- list()
mSet$time <- mSet$intensity <- list()
mSet$time <- split(MSnbase::rtime(raw_data), fromFile(raw_data))
mSet$intensity <- split(MSnbase::tic(raw_data), fromFile(raw_data))
signals <- suppressMessages(lapply(mSet$intensity, FUN = function(y) {
lag <- 15
threshold <- 2
influence <- 0.1
signals <- rep(0, length(y))
filteredY <- y[seq_len(lag)]
avgFilter <- NULL
stdFilter <- NULL
avgFilter[lag] <- mean(y[seq_len(lag)])
stdFilter[lag] <- sd(y[seq_len(lag)])
for (i in (lag + 1):length(y)) {
if (abs(y[i] - avgFilter[i - 1]) > threshold * stdFilter[i -
1]) {
if (y[i] > avgFilter[i - 1]) {
signals[i] <- 1
}
else {
signals[i] <- -1
}
filteredY[i] <- influence * y[i] + (1 - influence) *
filteredY[i - 1]
}
else {
signals[i] <- 0
filteredY[i] <- y[i]
}
avgFilter[i] <- mean(filteredY[(i - lag):i])
stdFilter[i] <- sd(filteredY[(i - lag):i])
}
return(list(signals = signals, avgFilter = avgFilter,
stdFilter = stdFilter))
}))
factorCol <- "Groups"
Groups <- basename(raw_data@processingData@files)
metadata <- data.frame(Sample.Name = basename(fileNames(raw_data)),
Groups = Groups)
sample_names <- paste(unlist(metadata[, factorCol]), seq_len(nrow(metadata)))
peakList <- list();
varExpThresh <- 0.8;
for (index in seq_along(sample_names)) {
peaks <- rle(signals[[index]]$signals)
counter <- 1
peakGroups <- list()
for (rleIndex in seq_along(peaks$lengths)) {
curValue <- peaks$values[rleIndex]
if (curValue == 1) {
peakGroups[[counter]] <-
data.frame(index = (startPoint + 1), length = peaks$lengths[rleIndex])
startPoint <- startPoint + peaks$lengths[rleIndex]
counter <- counter + 1
}
else {
if (rleIndex == 1) {
startPoint <- peaks$lengths[rleIndex]
}
else {
startPoint <- startPoint + peaks$lengths[rleIndex]
}
}
}
peakGroups <- Reduce(peakGroups, f = rbind)
signals[[index]]$signals[peakGroups$index[peakGroups$length ==
1] + 1] <- 1
findPeaks <- rle(signals[[index]]$signals == 1)
counter <- 1
peakGroups <- list()
for (rleIndex in seq_along(findPeaks$lengths)) {
curValue <- findPeaks$values[rleIndex]
if (curValue) {
start <- (startPoint + 1)
startPoint <- startPoint + findPeaks$lengths[rleIndex]
end <- startPoint
peakGroups[[counter]] <- data.frame(start, end,
length = end - start + 1)
counter <- counter + 1
}
else {
if (rleIndex == 1) {
startPoint <- findPeaks$lengths[rleIndex]
}
else {
startPoint <- startPoint + findPeaks$lengths[rleIndex]
}
}
}
peakGroups <- Reduce(rbind, peakGroups)
npeaks <- 10;
if (nrow(peakGroups) < 10) {
npeaks <- nrow(peakGroups)
#warning("You are strongly advised to use wider rt.idx or more
#samples or other trimming strategy !")
}
peakGroups <- peakGroups[order(peakGroups$length, decreasing = TRUE),
]
peak_times <- list()
for (j in seq_len(nrow(peakGroups))) {
peak_times[[j]] <- mSet$time[[index]][peakGroups$start[j]:peakGroups$end[j]]
}
max_peak_length <- max(vapply(X = peak_times, FUN = length,
FUN.VALUE = numeric(1)))
peak_table <- data.frame(matrix(nrow = max_peak_length,
ncol = npeaks + 1))
peak_table[, 1] <- seq_len(max_peak_length)
colnames(peak_table) <- c("peakLenth", paste("peak",
seq_len(npeaks)))
for (column in seq(from = 2, to = ncol(peak_table))) {
peak <- peak_times[[column - 1]]
peak_table[c(seq_along(peak)), column] <- peak
}
peak_table$peakLenth <- NULL
peakList[[index]] <- peak_table
}
names(peakList) <- sample_names
peak_table <- data.frame()
counter <- 1
for (sampleIndex in seq_along(peakList)) {
time <- mSet$time[[sampleIndex]]
intensity <- mSet$intensity[[sampleIndex]]
peaks <- peakList[[sampleIndex]]
peakIndexTable <- data.frame()
for (peakColIndex in seq_len(ncol(peaks))) {
tempPeakWidthEst <- peakwidth_est(peak_vector = peaks[,
peakColIndex],
time, intensity, start = NULL,
end = NULL, old_r2 = NULL)
if (length(time)/5 < diff(tempPeakWidthEst)) {
next();
# stop(paste("One peak was over 1/5 of all scans in length.",
# "This is probably an error."))
}
if (tempPeakWidthEst[2] > length(time)) {
tempPeakWidthEst[2] <- length(time)
}
peakIndexTable <- rbind(peakIndexTable, c(tempPeakWidthEst,
peakColIndex))
}
colnames(peakIndexTable) <- c("peakStart", "peakEnd",
"peakID")
for (curPeakIndexCol in seq_along(peakIndexTable$peakStart)) {
start <- peakIndexTable$peakStart[curPeakIndexCol]
end <- peakIndexTable$peakEnd[curPeakIndexCol]
storeRow <- data.frame(peak_width = time[end] -
time[start], Start_time = time[start],
End_time = time[end],
Start_name = names(time)[start],
End_name = names(time)[end],
Sample = sampleIndex,
Mid_point_time = (time[start] +
time[end])/2,
Max_intensity = max(intensity[start:end]),
Maxima_time = time[start + which(intensity[start:end] %in%
max(intensity[start:end]))])
peak_table <- rbind(peak_table, storeRow)
counter <- counter + 1
}
}
pb <- progress_bar$new(format = "Evaluating [:bar] :percent Time left: :eta",
total = length(unique(peak_table$Sample)), clear = TRUE,
width = 75)
totalEstimates <- list()
for (j in unique(peak_table$Sample)) {
pb$tick()
currentTable <- peak_table[peak_table$Sample == j, ]
raw_data_current <- filterFile(raw_data, j)
header_rt <- unname(mSet[["time"]][[j]])
header <- MSnbase::header(raw_data_current)
allMzs <- MSnbase::mz(raw_data_current)
allInt <- MSnbase::intensity(raw_data_current)
mzDb <- list()
for (i in seq_along(allInt)) {
mzDb[[i]] <- cbind(mz = allMzs[[i]], intensity = allInt[[i]])
}
rm(allMzs, allInt)
pickedParams <- list()
for (curPeak in seq_len(nrow(currentTable))) {
start <- currentTable[curPeak, "Start_time"]
end <- currentTable[curPeak, "End_time"]
width <- currentTable$peak_width[curPeak]
observedPeak <- list(start = start, end = end)
rate <- mean(diff(header_rt[header$ms.level == 1L]))
scansOfPeak <- which(observedPeak$start < header_rt &
header_rt < observedPeak$end)
peakHead <- header[scansOfPeak, ]
ms1 <- peakHead$ms.level == 1L
rm(peakHead, ms1)
peakMassSpectras <- mzDb[scansOfPeak]
for (i in seq_along(scansOfPeak)) {
peakMassSpectras[[i]] <- cbind(peakMassSpectras[[i]],
scansOfPeak[i])
}
peakMassSpectras <- Reduce(rbind, peakMassSpectras)
colnames(peakMassSpectras) <- c("mz", "intensity",
"scan")
peakMassSpectras <- data.frame(peakMassSpectras)
peakMassSpectras <- peakMassSpectras[order(peakMassSpectras$mz), ]
peakMassSpectras <- peakMassSpectras[peakMassSpectras$intensity > 0, ]
sortedAllEIC <- peakMassSpectras
matchedMasses <- rle(diff(sortedAllEIC$mz) < 0.005)
noiseAndPeaks <- filterPeaksfromNoise(matchedMasses)
no_match <- noiseAndPeaks[[1]]
truePeaks <- noiseAndPeaks[[2]]
rm(noiseAndPeaks)
approvedPeaks <- findTruePeaks(truePeaks, sortedAllEIC);
if(is.null(approvedPeaks)){(next)()}
overlappingScans <- sum(approvedPeaks$multipleInScan)
ppmEst <- try(filterPpmError(approvedPeaks, useGap = TRUE,
varExpThresh, returnPpmPlots = FALSE,
plotDir = NULL, observedPeak),
silent = TRUE);
if(is(ppmEst, "try-error") | is.na(ppmEst)){(next)()}
ppmObs <- approvedPeaks$meanPPM
ppmObs <- strsplit(split = ";", x = as.character(ppmObs))
ppmObs <- lapply(ppmObs, as.numeric)
noisyBin <- unlist(lapply(ppmObs, function(ppm) {
any(ppm > ppmEst)
}))
approvScorePeaks <- approvedPeaks[!noisyBin, ]
SNest <- try(estimateSNThresh(no_match, sortedAllEIC,
approvScorePeaks), silent = TRUE);
if(is(SNest,"try-error")){(next)()}
SNest <- suppressWarnings(min(SNest));
if (is.infinite(SNest)) {
SNest <- 25
}
scanEst <- min(approvScorePeaks$scanCount)
noiseEst <- min(approvScorePeaks$minIntensity) -
1000
if (noiseEst < 0) {
noiseEst <- 0
}
intensityEst <- min(approvScorePeaks$Intensity)/sqrt(2)
estimatedPeakParams <- data.frame(ppm = ppmEst,
noiseThreshold = noiseEst,
peakCount = nrow(approvedPeaks),
prefilterI = intensityEst,
prefilterScan = scanEst,
TenPercentQuanSN = unname(SNest))
if (is.null(estimatedPeakParams)) {
next
}
pickedParams[[curPeak]] <- cbind(estimatedPeakParams,
startTime = start,
endTime = end, sampleID = j)
}
sampleParams <- Reduce(rbind, pickedParams)
totalEstimates[[j]] <- sampleParams
}
eicParamEsts <- Reduce(rbind, totalEstimates)
param <- list()
param$ppm <- round(weighted.mean(eicParamEsts$ppm, eicParamEsts$peakCount),1)
param$noise <- min(eicParamEsts$noiseThreshold, na.rm = TRUE)
param$value_of_prefilter <- min(eicParamEsts$prefilterI,
na.rm = TRUE)
param$prefilter <- min(eicParamEsts$prefilterScan, na.rm = TRUE)
param
}
##### -------====== function kit from package IPO ====----######
#' @references Gunnar Libiseller et al. 2015. IPO: a tool for automated
#' optimization of XCMS parameters
#' @references https://github.com/glibiseller/IPO
getClusterType <- function() {
if( .Platform$OS.type=="unix" ) {
return("FORK")
}
return("PSOCK")
}
peaks_IPO <- function(mSet) {
peaks_act <-mSet@peakfilling$msFeatureData$chromPeaks;
if(is.null(peaks_act)){
peaks_act <-mSet@peakRTcorrection[["chromPeaks"]];
}
if (!("sample" %in% colnames(peaks_act))) {
colnames(peaks_act)[colnames(peaks_act) == ""] <- "sample"
}
peaks_act
}
calcMaximumCarbon <- function(masses) {
carbon = 12.0
hydrogen = 1.0078250170
CH3 = carbon + 3 * hydrogen
CH2 = carbon + 2 * hydrogen
maximum_carbon <- floor((masses-2*CH3)/CH2) + 2
}
getMaximumLevels <- function(model) {
# dimension of the modeled space
dimensions <- length(model$coding)
# define grid, to test for maximum
if(dimensions > 6) {
testSpace <- seq(-1,1,0.2) # 11 points
} else {
testSpace <- seq(-1,1,0.1) # 21 points
}
testSize <- 10^6 # maximum number of grid points for one test
# amount for testing each point in testSpace
testAmount <- length(testSpace)^dimensions
i <- 1
max <- rep(-1, dimensions+1) # start maximum response + setting
# if there are more test points (=testAmount), than testSize,
# then the tests are split and each subset is tested seperately
while(i < testAmount) {
testdata <- expand.grid.subset(i:(i+testSize), testSpace, dimensions)
if(dimensions==1)
names(testdata) <- names(model$coding)
max_tmp <- getMaxSettings(testdata, model)
if(max_tmp[1]>max[1]) # if better solution (i.e. test response)
max <- max_tmp
i <- i + testSize + 1
}
return(max)
}
getMaxSettings <- function(testdata, model) {
response <- predict(model, testdata)
max_response <- max(response)
# select row(s) corresponding to max
max_settings <- testdata[response==max_response,,drop=FALSE]
ret <- max_response
for(i in seq_len(ncol(testdata))) {
levels <- max_settings[,i] # all settings of variable i
if(all(c(-1,1) %in% levels)) # if both borders are in maximum settings
ret <- cbind(ret, NA)
else
ret <- cbind(ret,levels[1]) # take first setting
}
colnames(ret) <- c("response", paste("x", seq_len(ncol(testdata)), sep=""))
return(ret)
}
expand.grid.subset <- function(subset, sequence, dimensions) {
# generate a list, with sequence for each dimension
vars <- list()
for(i in seq_len(dimensions)) {
vars[[i]] <- sequence
}
names(vars) <- paste("x", seq_len(dimensions), sep="")
# number of points in sequence grid
maximumSubset <- length(sequence)^dimensions
# from min(subset)) to min(maximumSubset, max(subset)) OR
# from maximumSubset to maximumSubset
subset <- min(maximumSubset,min(subset)):min(maximumSubset, max(subset))
#nv <- #length(vars)
# number of values per variable = length(sequence)
lims <- sapply(vars,length)
stopifnot(length(lims) > 0, # i.e. dimensions > 0
subset <= prod(lims), # i.e. subset <= maximumSubset
length(names(vars)) == dimensions) # i.e. dimensions = dimensions
# empty list of length names(vars)
res <- structure(vector("list",dimensions), .Names = names(vars))
if (dimensions > 1) {
for(i in dimensions:2) { # count down to 2: set up grid top down
# %% = mod, %/% = integer division
f <- prod(lims[seq_len(i-1)]) # number of grid points up to variable nr. (i-1)
# repeat each element on grid 1:f
res[[i]] <- vars[[i]][(subset - 1)%/%f + 1]
subset <- (subset - 1)%%f + 1
}
}
res[[1]] <- vars[[1]][subset]
as.data.frame(res)
}
typeCastParams <- function(params) {
ret_1 <- list()
ret_2 <- list()
ret <- list()
for(i in seq_along(params)) {
factor <- params[[i]]
if(length(factor) == 2) {
ret_1[[(length(ret_1)+1)]] <- factor
names(ret_1)[length(ret_1)] <- names(params)[i]
} else {
ret_2[[(length(ret_2)+1)]] <- factor
names(ret_2)[length(ret_2)] <- names(params)[i]
}
}
ret$to_optimize <- ret_1
ret$no_optimization <- ret_2
return(ret)
}
getCcdParameter <- function(params) {
lower_bounds <- unlist(lapply(X=params, FUN=function(x) x[1]))
higher_bounds <- unlist(lapply(X=params, FUN=function(x) x[2]))
steps <- (higher_bounds - lower_bounds)/2
# formula for each parameter, that transformes values from the range
# to [0, 1]
x <- paste("x", seq_along(params), " ~ (", c(names(params)), " - ",
(lower_bounds + steps), ")/", steps, sep="")
# list with single formulas as entries
formulae <- list()
for(i in seq_along(x))
formulae[[i]] <- as.formula(x[i])
design <- rsm::ccd(length(params), # number of variables
n0 = 1, # number of center points
alpha = "face", # position of the ‘star’ points
randomize = FALSE,
inscribed = TRUE, # TRUE: axis points are at +/- 1 and the
# cube points are at interior positions
coding = formulae) # List of coding formulas for the design
# variables
return(design)
}
combineParams <- function(params_1, params_2) {
len <- max(unlist(sapply(params_1, length)))
#num_params <- length(params_1)
p_names <- c(names(params_1), names(params_2))
matchedFilter <- "fwhm" %in% p_names
for(i in seq_along(params_2)) {
new_index <- length(params_1) + 1
fact <- params_2[[i]]
params_1[[new_index]] <- fact
if(matchedFilter) {
if(p_names[new_index] == "sigma" && fact == 0) {
# update values for sigma if zero
if("fwhm" %in% names(params_1)) {
params_1[[new_index]][seq_len(len)] <- params_1$fwhm/2.3548
} else {
params_1[[new_index]][seq_len(len)] <- params_2$fwhm/2.3548
}
} else if(p_names[new_index] == "mzdiff" && fact == 0) {
# update values for mzdiff if zero
if("step" %in% names(params_1)) {
if("steps" %in% names(params_1)) {
params_1[[new_index]][seq_len(len)] <- 0.8-params_1$step*params_1$steps
} else {
params_1[[new_index]][seq_len(len)] <- 0.8-params_1$step*params_2$steps
}
} else {
if("steps" %in% names(params_1)) {
params_1[[new_index]][seq_len(len)] <- 0.8-params_2$step*params_1$steps
} else {
params_1[[new_index]][seq_len(len)] <- 0.8-params_2$step*params_2$steps
}
}
} else {
# standard: replicate value
params_1[[new_index]][seq_len(len)] <- fact
}
} else {
# standard: replicate value
params_1[[new_index]][seq_len(len)] <- fact
}
}
names(params_1) <- p_names
return(params_1)
}
getRGTVValues <- function(mSet, exp_index=1, retcor_penalty=1) {
relative_rt_diff <- c();
peaks <- mSet@peakfilling$msFeatureData$chromPeaks;
#fgs <- mSet$FeatureGroupTable
fgs <- mSet@peakfilling$FeatureGroupTable
groups <- S4Vectors::as.matrix(fgs[, -ncol(fgs)])
rownames(groups) <- NULL
groupidx <- fgs$peakidx
if(nrow(groups) > 0) {
for(i in seq_len(nrow(groups))) {
feature_rtmed <- groups[i, "rtmed"]
relative_rt_diff <-
c(relative_rt_diff,
mean(abs(feature_rtmed -
peaks_IPO(mSet)[groupidx[[i]], "rt"]) / feature_rtmed))
}
good_groups <-
sum(unlist(lapply(X=groupidx, FUN = function(x, mSet) {
ifelse(length(unique(peaks_IPO(mSet)[x,"sample"])) ==
length(fileNames(mSet@rawInMemory)) &
length(peaks_IPO(mSet)[x,"sample"]) ==
length(fileNames(mSet@rawInMemory)), 1, 0)
}, mSet)))
bad_groups <- nrow(groups) - good_groups
} else {
relative_rt_diff <- 1
good_groups <- 0
bad_groups <- 0
}
tmp_good_groups <- good_groups + ifelse(bad_groups==0, 1, 0)
tmp_bad_groups <- bad_groups + ifelse(bad_groups==0, 1, 0)
ARTS <- (mean(relative_rt_diff)) * retcor_penalty
ret <- list(exp_index = exp_index,
good_groups = good_groups,
bad_groups = bad_groups,
GS = tmp_good_groups^2/tmp_bad_groups,
RCS = 1/ARTS)
ret$retcor_done = retcor_penalty
return(ret)
}
findIsotopes.IPO <- function(mSet, checkPeakShape=c("none", "borderIntensity",
"sinusCurve", "normalDistr")) {
checkPeakShape <- match.arg(checkPeakShape)
iso_mat <- matrix(0, nrow=0, ncol=2)
if(is.null(mSet)) {
return(iso_mat)
}
colnames(iso_mat) <- c("12C", "13C")
peak_source <- peaks_IPO(mSet)[,c("mz", "rt", "sample", "into", "maxo", "mzmin",
"mzmax", "rtmin", "rtmax"), drop=FALSE]
for(i in seq_len(ncol(peak_source))) {
peak_source <- peak_source[!is.na(peak_source[,i]),,drop=FALSE]
}
peak_source <- cbind(seq_len(nrow(peak_source)), peak_source)
colnames(peak_source)[1] <- "id"
#carbon = 12.0
#hydrogen = 1.0078250170
#CH3 = carbon + 3 * hydrogen
#CH2 = carbon + 2 * hydrogen
isotope_mass = 1.0033548
isotope_abundance = 0.01108
samples <- max(peak_source[,"sample"])
#start_sample
for(sample in seq_len(samples)) {
#only looking into peaks from current sample
speaks <- peak_source[peak_source[,"sample"]==sample,,drop=FALSE]
split <- 250
rawdata <- NULL;
if(!(checkPeakShape=="none"))
# rawdata <- loadRaw(xcmsSource(xset@filepaths[sample]))
stop("Other PeakShapeChecking Method will be supported later !")
if(nrow(speaks)>1) {
#speaks <- speaks[,-c("sample")]
speaks <- speaks[order(speaks[,"mz"]),]
while(!is.null(nrow(speaks)) & length(speaks) > 3) {
part_peaks <- NULL
#splitting the data into smaller pieces to improve speed
if(nrow(speaks) < split) {
part_peaks <- speaks
} else {
upper_bound <- speaks[split,"mzmax"] + isotope_mass
end_point <- sum(speaks[,"mz"] < upper_bound)
part_peaks <- speaks[seq_len(end_point),,drop=FALSE]
}
rt <- part_peaks[,"rt"]
rt_window <- rt * 0.005
rt_lower <- part_peaks[,"rt"] - rt_window
rt_upper <- part_peaks[,"rt"] + rt_window
rt_matrix <-
t(matrix(rep(rt, nrow(part_peaks)), ncol=nrow(part_peaks)))
rt_matrix_bool <- rt_matrix >= rt_lower & rt_matrix <= rt_upper
mz <- part_peaks[,"mz"]
#isotope_masses - mz_window
mz_lower <- part_peaks[,"mzmin"] + isotope_mass
#isotope_masses + mz_window
mz_upper <- part_peaks[,"mzmax"] + isotope_mass
mz_matrix <-
t(matrix(rep(mz, nrow(part_peaks)), ncol=nrow(part_peaks)))
mz_matrix_bool <- mz_matrix >= mz_lower & mz_matrix <= mz_upper
rt_mz_matrix_bool <- rt_matrix_bool & mz_matrix_bool
rt_mz_peak_ids <- which(rowSums(rt_mz_matrix_bool)>0)
calculations <- min(split, nrow(speaks))
rt_mz_peak_ids <- rt_mz_peak_ids[rt_mz_peak_ids < calculations]
for(i in rt_mz_peak_ids) {
current <- part_peaks[i, ,drop=FALSE]
rt_mz_peaks <- part_peaks[rt_mz_matrix_bool[i,],,drop=FALSE]
rt_difference <-
abs(current[,"rt"] - rt_mz_peaks[, "rt"]) / current[,"rt"]
rt_mz_peaks <- cbind(rt_mz_peaks, rt_difference)
#test intensity_window
#floor((current["mz"]-2*CH3)/CH2) + 2
maximum_carbon <- calcMaximumCarbon(current[,"mz"])
carbon_probabilty <- c(1,maximum_carbon)*isotope_abundance
iso_intensity <- current[,"into"] * carbon_probabilty
int_bools <-
rt_mz_peaks[,"into"] >= iso_intensity[1] &
rt_mz_peaks[,"into"] <= iso_intensity[2]
if(sum(int_bools) > 0) {
int_peaks <- rt_mz_peaks[int_bools,,drop=FALSE]
boundary_bool <- rep(TRUE, (nrow(int_peaks)+1))
if(!(checkPeakShape=="none")) {
if(checkPeakShape=="borderIntensity") {
boundary_bool <- checkIntensitiesAtRtBoundaries(
rawdata,
rbind(current,int_peaks[,-ncol(int_peaks), drop=FALSE]))
} else {
if(checkPeakShape=="sinusCurve") {
boundary_bool <- checkSinusDistribution(
rawdata,
rbind(current,int_peaks[,-ncol(int_peaks),drop=FALSE]))
} else {
boundary_bool <- checkNormalDistribution(
rawdata,
rbind(current,int_peaks[,-ncol(int_peaks),drop=FALSE]))
}
}
} #else {
#boundary_bool <- rep(TRUE, (nrow(int_peaks)+1))
#}
if(boundary_bool[1] & sum(boundary_bool[-1])>0) {
iso_peaks <- int_peaks[boundary_bool[-1],,drop=FALSE]
iso_id <-
iso_peaks[which.min(iso_peaks[,"rt_difference"]), "id"]
#iso_list[[length(iso_list)+1]] <- c(current[,"id"], iso_id)
iso_mat <- rbind(iso_mat, c(current[,"id"], iso_id))
}
}
}
speaks <- speaks[-(seq_len(calculations)),]
}#end_while_sample_peaks
}
}
return(iso_mat)
}
checkIntensitiesAtRtBoundaries <- function(rawdata, peaks,
minBoundaryToMaxo=1/3,ppmstep=15) {
ret <- rep(TRUE, nrow(peaks))
for(i in seq_len(nrow(peaks))) {
peak <- peaks[i,]
for(boundary in c("rtmin", "rtmax")) {
rtIndex <- which(rawdata$rt==peak[boundary])
if(length(rtIndex)>0) {
if(rtIndex==length(rawdata$scanindex)) {
rtIndices <- c(rawdata$scanindex[rtIndex], length(rawdata$mz))
} else {
rtIndices <- rawdata$scanindex[c(rtIndex, rtIndex+1)]
}
#only relevant mz and intensity values regarding retention time
mz <- rawdata$mz[(rtIndices[1]+1):rtIndices[2]]
intensities <- rawdata$intensity[(rtIndices[1]+1):rtIndices[2]]
ppm <- peak[c("mzmin", "mzmax")]*ppmstep/1000000
mzIntensities <-
c(0, intensities[mz>=peak["mzmin"]-ppm[1] & mz<=peak["mzmax"]+ppm[2]])
maxBoundaryIntensity <- max(mzIntensities)
ret[i] <- ret[i] & maxBoundaryIntensity<peak["maxo"]*minBoundaryToMaxo
}
}
}
return(ret)
}
checkSinusDistribution <- function(rawdata, peaks) {
ret <- rep(TRUE, nrow(peaks))
for(i in seq_len(nrow(peaks))) {
ret[i] <- testSinusDistribution(rawdata, peaks[i,,drop=FALSE])
}
return(ret)
}
checkNormalDistribution <- function(rawdata, peaks) {
ret <- rep(TRUE, nrow(peaks))
for(i in seq_len(nrow(peaks))) {
ret[i] <- testNormalDistribution(rawdata, peaks[i,,drop=FALSE])
}
return(ret)
}
getIntensitiesFromRawdata <- function(rawdata, peak) {
rt <- rawdata$rt >= peak[,"rtmin"] & rawdata$rt <= peak[,"rtmax"]
rtRange <- c(min(which(rt)), max(which(rt))+1)
scanIndices <-
rawdata$scanindex[rtRange[1]:min(rtRange[2], length(rawdata$scanindex))]
# scanIndices <- scanIndices[!is.na(scanIndices)]
if(rtRange[2]>length(rawdata$scanindex)) {
scanIndices <- c(scanIndices, length(rawdata$intensity))
}
if(length(scanIndices) < 3)
return(FALSE)
y <- c()
for(i in seq_len(length(scanIndices)-1)) {
scanRange <- c(scanIndices[i]+1, scanIndices[i+1])
mz <- rawdata$mz[scanRange[1]:scanRange[2]]
y <-
c(y,
max(0, (rawdata$intensity[scanRange[1]:scanRange[2]][
mz >= peak[,"mzmin"] & mz <= peak[,"mzmax"]])
)
)
}
y
}
#' @importFrom stats dnorm cor
testNormalDistribution <- function(rawdata, peak) {
y <- getIntensitiesFromRawdata(rawdata, peak)
if(length(y) < 3) {
return(FALSE)
}
if(max(y)==0) {
return(FALSE)
}
normY <- (y-min(y))/(max(y)-min(y))
mean=10;
sd=3;
seqModel <- seq(-4,4,length=length(normY))*sd + mean
yModel <- dnorm(seqModel,mean,sd)
yModel = yModel* (1/max(yModel))
correlation <- cor(yModel, normY)
correlation > 0.7
}
#' @importFrom stats cor
testSinusDistribution <- function(rawdata, peak) {
y <- getIntensitiesFromRawdata(rawdata, peak)
if(length(y) < 3) {
return(FALSE)
}
if(max(y)==0) {
return(FALSE)
}
normY <- (y-min(y))/(max(y)-min(y))
sinCurve <- (sin(seq(-pi/2,pi+1.5,length=length(normY))) + 1) / 2
correlation <- cor(sinCurve, normY)
correlation > 0.7
}
findIsotopes.CAMERA <- function(mSet, ...) {
# iso_mat <- matrix(0, nrow=0, ncol=2)
# if(is.null(mSet)) {
# return(iso_mat)
# }
#
# ids <- peaks_IPO(mSet)[,"sample", drop=FALSE]
# ids <- cbind(1:length(ids), ids)
#
# xsets <- split(mSet, unique(peaks_IPO(mSet)[,"sample"]))
# samples <- unique(peaks_IPO(mSet)[,"sample"])
# for(sample in samples) {
# an <- xsAnnotate(mSet, sample=sample)
# isos <- findIsotopes(an, ...)@isoID[,c("mpeak", "isopeak"), drop=FALSE]
# #start_id <- ids[ids[,2]==sample,,drop=FALSE][1,1] - 1
# iso_mat <- rbind(iso_mat, matrix(ids[ids[,2]==sample,1][isos], ncol=2))
# }
#
# iso_mat
}
#' @importFrom stats as.formula lm
createModel <- function(design, params, resp) {
# add response to the design, which gives the data for the model
design$resp <- resp
if(length(params) > 1) {
# create full second order (SO) model
# use xi in model, instead of parameter names
formula <-
as.formula(paste("resp ~ SO(",
paste("x", seq_along(params),
sep="", collapse=","),
")", sep=""))
model <- rsm::rsm(formula, data=design)
} else {
# create full second order model with one parameter
# here: use parameter name in model
param_name <- names(params)[1]
formula <- as.formula(paste("resp ~ ", param_name, " + ",
param_name, " ^ 2", sep=""))
model <- lm(formula, data=design)
model$coding <- list(x1=as.formula(paste(param_name, "~ x1")))
names(model$coding) <- param_name
#attr(model, "class") <- c("rsm", "lm")
}
return(model)
}
decode <- function(value, bounds) {
if(is.na(value))
value <- 1
x <- (value+1)/2 # from [-1,1] to [0, 1]
x <- (x*(max(bounds)-min(bounds))) + min(bounds)
return(x)
}
decodeAll <- function(values, params) {
ret <- rep(0, length(params))
for(i in seq_along(params))
ret[i] <- decode(values[i], params[[i]])
names(ret) <- names(params)
return(ret)
}
encode <- function(value, bounds) {
x <- (value - min(bounds)) / (max(bounds) - min(bounds))
return(x*2-1)
}
attachList <- function(params_1, params_2) {
params <- params_1
for(factor in params_2)
params[[length(params)+1]] <- factor
names(params) <- c(names(params_1), names(params_2))
return(params)
}
checkParams <- function(params, quantitative_parameters,
qualitative_parameters,
unsupported_parameters) {
if(length(typeCastParams(params)$to_optimize)==0) {
stop("No parameters for optimization specified; stopping!")
}
for(i in seq_along(params)) {
param <- params[[i]]
name <- names(params)[i]
if(name %in% unsupported_parameters) {
stop(c("The parameter ", name, " is not supported!
Please remove from parameters; stopping!"))
}
if(name %in% qualitative_parameters) {
if(length(param) == 0) {
stop(c("The parameter ", name, " has no value set! Please specify; stopping!"))
}
if(length(param) > 1) {
stop(c("Optimization of parameter ", name, " not supported!
Please specify only one value; stopping!"))
}
}
if(name %in% quantitative_parameters) {
if(length(param) == 0) {
stop(c("The parameter ", name, " has no value set!
Please specify between one and two; stopping!"))
}
if(length(param) > 2) {
stop(c("Too many values for parameter ", name, " !
Please specify only one or two; stopping!"))
}
if(!all(diff(param) > 0)) {
stop(c("Parameter ", name, " has wrong order!",
"Please specify in increasing order; stopping!"))
}
}
}
missing_params <-
which(!(c(quantitative_parameters, qualitative_parameters) %in%
names(params)))
if(length(missing_params > 0)) {
stop(paste("The parameter(s)",
paste(c(quantitative_parameters,
qualitative_parameters)[missing_params],
collapse=", "),
"is/are missing! Please specify; stopping!"))
}
}
#
##### -----------------========== Bottom of this function kit ======----------------######
###########--------------- ========= Function Kit from AutoTuner =========-----------------
#' @references McLean C (2020). Autotuner: Automated parameter
#' selection for untargeted metabolomics data processing
#' @references https://github.com/crmclean/Autotuner/
filterPeaksfromNoise <- function(matchedMasses) {
## Initializing variables for subsetting
if(length(matchedMasses$values) %% 2 == 0) {
list_length <- length(matchedMasses$values)/2
} else {
list_length <- as.integer(length(matchedMasses$values)/2) + 1
}
truePeaks <- vector("list", list_length)
truePeakIndex <- 1
lengthCounter <- 1
lastTrue <- matchedMasses$values[1]
## subsets things that have a second mass w/in user error
for(rleIndex in seq_along(matchedMasses$values)) {
start <- lengthCounter
if(lastTrue == TRUE) {
start <- start + 1
}
end <- lengthCounter + matchedMasses$lengths[rleIndex] - 1
if(isTRUE(matchedMasses$values[rleIndex])) {
end <- end + 1
lastTrue = TRUE
truePeaks[[truePeakIndex]] <- start:end
truePeakIndex <- truePeakIndex + 1
} else {
lastTrue = FALSE
if(!exists("no_match")) {
no_match <- start:end
} else {
no_match <- c(no_match, start:end)
}
}
lengthCounter <- lengthCounter + matchedMasses$lengths[rleIndex]
}
if(is.null(truePeaks[[list_length]])) {
truePeaks <- truePeaks[seq_len(list_length - 1)]
}
rm(list_length,truePeakIndex,lastTrue,lengthCounter,start,end,rleIndex)
return(list(no_match, truePeaks))
}
findTruePeaks <- function(truePeaks, sortedAllEIC) {
#
# initializing storage ----------------------------------------------------
ppmData <- list()
counter <- 1
# Checking if bin elements come from adj scans ----------------------------
for(i in seq_along(truePeaks)) {
pickedPeak <- truePeaks[[i]]
peakData <- sortedAllEIC[pickedPeak,]
peakData <- peakData[order(peakData$scan),]
# remove features that are duplicated. -------------------------------
if(nrow(peakData) == 1) {
next()
}
## checking to make sure features comes from adjacent scans
scanDiff <- diff(sort(unique(peakData$scan)))
## checking that binned peaks:
# are being picked up within consecutive scans
peakDists <- (length(unique(scanDiff)) == 1 && unique(scanDiff) == 1)
if(!peakDists) {
next()
}
## checking to see if any binned masses come from the same scan
countsInScans <- table(peakData$scan)
moreInAScan <- any(as.vector(countsInScans) > 1)
if(moreInAScan) {
for(w in seq_len(length(unique(peakData$scan)) - 1)) {
curScan <- unique(peakData$scan)[w]
nextScan <- unique(peakData$scan)[w+1]
### add condition here for whenever it is the end of the scan
scanStates <- peakData[peakData$scan == curScan,]
nextStates <- peakData[peakData$scan == nextScan,]
curObsRows <- peakData$scan == curScan | peakData$scan ==
nextScan
peakData <- peakData[!curObsRows,]
## do this if there are two states in first scan
if(w == 1) {
errorNext <- lapply(scanStates$mz, function(x) {
obsError <- abs(x - nextStates$mz)/x * 10^6
})
checkMins <- vapply(X = errorNext, FUN = which.min,
FUN.VALUE = numeric(1))
initialState <- which.min(vapply(X = seq_along(errorNext),
FUN = function(x) {
errorNext[[x]][checkMins[x]]
}, FUN.VALUE = numeric(1)))
scanStates <- scanStates[initialState,]
}
nextStateIndex <- which.max(vapply(X = scanStates$mz,
FUN = function(x) {
obsError <- abs(x - nextStates$mz)/x * 10^6
if(any(obsError == 0)) {
## corner case - error is 0 and there are no other
## options
if(length(x) == 1) {
obsError <- 0.001
} else {
obsError[obsError == 0] <-
min(obsError[obsError != 0])/10
}
}
intensityProb <- nextStates$intensity/
sum(nextStates$intensity)
errorInverse <- 1/obsError
nextStateProb <- errorInverse/sum(errorInverse) *
intensityProb
return(max(nextStateProb/sum(nextStateProb)))
}, FUN.VALUE = numeric(1)))
nextStates <- nextStates[nextStateIndex,]
## store new states
bestStates <- rbind(scanStates,nextStates)
peakData <- rbind(bestStates, peakData)
peakData <- peakData[order(peakData$scan),]
}
multipleInScan <- TRUE
} else {
multipleInScan <- FALSE
}
obsPPM <- vapply(X = 2:length(peakData$mz), FUN = function(mz) {
estimatePPM(peakData$mz[(mz - 1)], peakData$mz[mz])
}, FUN.VALUE = numeric(1))
# storing output -------------------------------------------------------
ppmData[[counter]] <- data.frame(meanMZ = mean(peakData$mz),
startScan = min(peakData$scan),
endScan = max(peakData$scan),
scanCount = length(peakData$scan),
Intensity = sum(peakData$intensity),
meanIntensity = mean(
peakData$intensity),
intensityDispersion = sd(
peakData$intensity),
minIntensity = min(peakData$intensity),
meanPPM = paste(signif(obsPPM),
collapse = ";"),
multipleInScan,
stringsAsFactors = FALSE,
index = i)
counter <- 1 + counter
}
approvedPeaks <- Reduce(rbind, ppmData)
return(approvedPeaks)
}
estimatePPM <- function(first, second) {
abs(first-second)/first * 10 ^ 6
}
#' estimateSNThresh
#' @noRd
#' @importFrom stats var
estimateSNThresh <- function(no_match, sortedAllEIC, approvedPeaks) {
noisePeakTable <- sortedAllEIC[no_match,]
noise_noise <- noisePeakTable$intensity
scanCount <- sortedAllEIC$scan
maxScan <- max(scanCount, na.rm = TRUE)
minScan <- min(scanCount, na.rm = TRUE)
scanCount <- scanCount[no_match]
## generating index for subseting of fixed noise obj
scanIntervals <- list()
for(peakID in seq_len(nrow(approvedPeaks))) {
peakStart <- approvedPeaks[peakID,"startScan"]
peakEnd <- approvedPeaks[peakID,"endScan"]
peakDist <- peakEnd - peakStart
lowerBound <- peakStart - peakDist * 2
if(lowerBound < minScan) {
lowerBound <- minScan
}
upperBound <- peakEnd + peakDist * 2
if(upperBound > maxScan) {
upperBound <- maxScan
}
scanIntervals[[peakID]] <- which(minScan:maxScan %in% c(lowerBound,
upperBound))
}
## calculating all fixed noise values
scanRange <- (minScan:maxScan)
fixedNoiseList <- list()
counter <- 1
for(scanID in seq_along(scanRange)) {
peakNoise <- noise_noise[scanCount == scanRange[scanID]]
if(length(peakNoise) == 0) {
next
}
fixedNoise <- peakNoise[!(peakNoise %in% boxplot.stats(peakNoise)$out)]
fixedNoiseMean <- mean(x = fixedNoise, na.rm = TRUE)
fixedNoiseVar <- stats::var(x = fixedNoise, na.rm = TRUE)
## 2019-07-08: fixed corner case where only one noise element was
## found within the bin
if(is.na(fixedNoiseVar)) {
fixedNoiseVar <- 0
}
N <- length(fixedNoise);
fixedNoiseList[[counter]] <- data.frame(fixedNoiseMean,fixedNoiseVar,N);
counter <- 1 + counter;
}
fixedNoiseList <- Reduce(rbind, fixedNoiseList);
## calculating the sd and mean for each group
noiseIntDb <- list()
counter <- 1
for(row in seq_along(scanIntervals)) {
if(row == 1) {
new <- TRUE;
} else {
new <- vapply(X = noiseIntDb, FUN = function(noiseDb) {
if(!all(scanIntervals[[row]] == as.numeric(noiseDb[,c(1,2)]))) {
return(TRUE)
} else {
return(FALSE)
}
}, FUN.VALUE = logical(1))
new <- all(new)
}
if(new) {
## add check to see if cur row has already been looked at
curRow <- scanIntervals[[row]]
curStatDb <- fixedNoiseList[curRow[1]:curRow[2],]
## added this to check for case when row is missing
## if a match to the noise peak was not identified
if(any(is.na(curStatDb$fixedNoiseMean))) {
curStatDb <- curStatDb[!is.na(curStatDb$fixedNoiseMean),]
}
eX2 <- sum((curStatDb$fixedNoiseMean^2 +
curStatDb$fixedNoiseVar)*curStatDb$N)
eX2 <- eX2/sum(curStatDb$N)
groupMean <- mean(curStatDb$fixedNoiseMean)
groupVar <- eX2 - groupMean^2
if(groupVar < 0) {
groupVar <- groupVar * -1
}
groupSd <- suppressWarnings(sqrt(groupVar))
noiseIntDb[[counter]] <- data.frame(start = curRow[1],
end = curRow[2], groupMean,
groupSd)
counter <- counter + 1
}
}
noiseIntDb <- Reduce(rbind, noiseIntDb);
noiseIntDb$key <- apply(noiseIntDb[,c(1,2)], 1, paste, collapse = " ");
rm(curStatDb, eX2, groupSd, groupVar, groupMean,
curRow, fixedNoiseList)
SN <- list()
counter <- 1
for(peakID in seq_along(scanIntervals)) {
scanInt <- paste(scanIntervals[[peakID]], collapse = " ")
scanStats <- noiseIntDb[noiseIntDb$key == scanInt,]
if(nrow(scanStats) == 0) {
next()
}
## 2019-06-20 - added here to fix bug if noise calc is wrong
if(is.nan(scanStats$groupSd) | is.na(scanStats$groupSd)) {
next()
}
if(is.nan(scanStats$groupMean)) {
next()
}
Peak <- approvedPeaks$Intensity[peakID]
if((Peak - scanStats$groupMean) > 3*scanStats$groupSd) {
## selects as true peak
SigNoiseThresh <- (Peak - scanStats$groupMean)/scanStats$groupSd
SN[[counter]] <- SigNoiseThresh
counter <- counter + 1
} else {
next()
}
}
if(!exists("SN")) {
return(NA)
}
return(unlist(SN))
}
#' filterPpmError
#' @noRd
#' @importFrom stats density kmeans
#' @importFrom entropy KL.empirical
#' @importFrom cluster clusGap
#' @importFrom grDevices boxplot.stats
filterPpmError <- function(approvedPeaks, useGap, varExpThresh,
returnPpmPlots, plotDir, observedPeak,
filename) {
ppmObs <- approvedPeaks$meanPPM
ppmObs <- unlist(lapply(strsplit(split = ";", x = as.character(ppmObs)),
function(x) {as.numeric(x)}))
## Only consider top 1/3 peaks because high quality signals alrady selected
ppmObs <- sort(ppmObs,decreasing = TRUE)[1:ceiling(length(ppmObs)/3)]
## 2019-06-19
## corner case when all error measurements are identical.
if(diff(range(ppmObs)) < .Machine$double.eps ^ 0.5) {
stop(c("All calculated ppm values are identical.",
"Mass variation of data may be higher than the mass threshold value."
))
}
#message("-------- Number of ppm value across bins: ", length(ppmObs))
#set.seed(161);
if(length(ppmObs) > 10000) {
set.seed(1141);
ppmObs <- ppmObs[sample(x = seq_along(ppmObs), size = 5000)]
}
if(length(ppmObs) > 750) {
checkPpm <- length(ppmObs)/2
subsample <- TRUE
while(subsample) {
origDist <- stats::density(ppmObs, bw = 1)$y
newDist1 <- stats::density(sample(ppmObs, checkPpm), bw = 1)$y
newDist2 <- stats::density(sample(ppmObs, checkPpm), bw = 1)$y
newDist3 <- stats::density(sample(ppmObs, checkPpm), bw = 1)$y
newDist4 <- stats::density(sample(ppmObs, checkPpm), bw = 1)$y
newDist5 <- stats::density(sample(ppmObs, checkPpm), bw = 1)$y
newDist6 <- stats::density(sample(ppmObs, checkPpm), bw = 1)$y
newDist7 <- stats::density(sample(ppmObs, checkPpm), bw = 1)$y
klDistance <- list()
subSamples <- ls()[grep("newDist",ls())]
for(j in seq_along(subSamples)) {
klDistance[[j]] <- suppressWarnings(entropy::KL.empirical(
origDist,get(subSamples[j])))
}
klDistance <- unlist(klDistance)
if(any(klDistance >= 0.5)) {
subsample <- FALSE
} else {
checkPpm <- checkPpm/2
}
}
ppmObs <- sample(ppmObs, checkPpm)
}
# ppmEsts <- bplapply(c(1:500),
# FUN = ppmEstCore,
# ppmObs = ppmObs,
# useGap = useGap, BPPARAM = MulticoreParam(4))
# ppmEsts <- unlist(ppmEsts)
# # ppmEsts <- vapply(c(1:250),
# # ppmEstCore,
# # FUN.VALUE = vector(mode = "double", length = 1),
# # ppmObs = ppmObs,
# # useGap = useGap)
#
# #save(ppmEsts, file = "ppmEsts.rda")
# ppmEsts <- sort(ppmEsts)[-c(1, 500)] # remove top extreme values
# ppmEst <- mean(ppmEsts) + sd(ppmEsts)*2 # increase to cover more
ppmEsts <- vapply(c(1:50),
ppmEstCore,
FUN.VALUE = vector(mode = "double", length = 1),
ppmObs = ppmObs,
useGap = useGap,
varExpThresh = varExpThresh)
ppmEsts <- sort(ppmEsts)[-c(1, 50)] # remove top extreme values
ppmEst <- mean(ppmEsts) + sd(ppmEsts)*2 # increase to cover more
return(ppmEst)
}
ppmEstCore <- function(xx, ppmObs,useGap, varExpThresh) {
if(length(ppmObs) < 100) {
#set.seed(xx)
kmeansPPM <- kmeans(ppmObs, 1)
# } else if(useGap) {
# set.seed(xx)
# gapStat <- cluster::clusGap(x = as.matrix(ppmObs),
# FUNcluster = kmeans,
# K.max = 5,
# B = 7,
# verbose = FALSE)
#
# gapStat <- gapStat$Tab;
# gap <- diff(-gapStat[,3]) > 0;
# if(any(gap)) {
# clusters <- max(which(gap)) + 1;
# } else {
# clusters <- 1;
# }
# set.seed(xx)
# kmeansPPM <- kmeans(ppmObs, clusters);
} else {
## estimating clustering based on hard coded 80% Vexp threshold
clustCount <- 1
varExp <- 0
while(varExp < varExpThresh && clustCount < length(ppmObs)/2) {
#set.seed(xx)
kmeansPPM <- kmeans(ppmObs, clustCount)
varExp <- kmeansPPM$betweenss/kmeansPPM$totss
clustCount <- clustCount + 1
}
}
## cluster which contains smallest ppm values
clusterSize <-sort(table(kmeansPPM$cluster),decreasing = TRUE)
maxCluster <- names(clusterSize)[1]
minCluster <- which(kmeansPPM$cluster == maxCluster)
rm(clusterSize)
x <- ppmObs
n <- length(x)
h <- 1
## delete this later
gauss <- function(x) 1/sqrt(2*pi) * exp(-(x^2)/2)
gaussDKE <- function(a, x) gauss((x - a)/h)/(n * h)
bumps <- vapply(X = ppmObs[minCluster], FUN = gaussDKE, x = x,
FUN.VALUE = numeric(length = length(x)))
wholeKDE <- vapply(X = ppmObs, FUN = gaussDKE,
x,
FUN.VALUE = numeric(length =
length(x)))
## calculating this ahead of time to avoid unnecessary downstream
## math
cKdeMean <- sum(rowSums(bumps))/length(minCluster)
OutlierScore <- rowSums(wholeKDE)/(cKdeMean)
scoreSub <- which(OutlierScore > 1)
ppmEst <- max(ppmObs[scoreSub])
maxX <- ppmEst
ppmEst <- ppmEst + sd(ppmObs[scoreSub])*3
return(ppmEst)
}
#' @importFrom stats smooth.spline
peakwidth_est <- function(peak_vector,
time,
intensity,
start = NULL,
end = NULL,
old_r2 = NULL) {
killSwitch <- FALSE
# check to make sure input values come from vector
if(!is.numeric(peak_vector)) {
warning(c("A non numeric vector was given to peakwidth_est().",
"This is incorrect. Check the function input."))
}
# updating data values to put into regression
if(!is.null(start)) { # case where we are in second + iteration of algorithm
end <- end + 1
} else { # case where the algorithm is run for the first time
peak_index <- which(time %in% peak_vector)
if(length(peak_index) == 0) {
stop("The peak entered here could not be matched to the chromatography data.")
}
start <- peak_index[1] - 1
end <- peak_index[length(peak_index)]
}
# terms for lm
points <- c(start,start-1,start-2,start-3,end,end+1,end+2,end+3)
intensityObs <- intensity[points]
modelIndex <- seq_along(intensityObs)
# correcting na formation within intensity observation vector
if(any(is.na(intensityObs))) {
naObsIndex <- which(is.na(intensityObs))
modelIndex <- modelIndex[-naObsIndex]
intensityObs <- intensityObs[-naObsIndex]
killSwitch <- TRUE
}
# running smoothing spline on the data
if(sum(!is.na(peak_vector)) > 1) {
splineOut <- smooth.spline(modelIndex, intensityObs)
splineObs <- splineOut$fit$coef
splineIndex <- seq_along(splineObs)
} else {
splineObs <- intensityObs
}
splineIndex <- seq_along(splineObs)
# running a linear model on the outcome of the spline
chrom_table <- data.frame(splineIndex, splineObs)
model <- lm(formula = splineObs ~ splineIndex, data = chrom_table)
new_r2 <- summary(model)$r.squared
# used for comparison with previous itterations
if(is.null(old_r2)) {
old_r2 <- 0
}
# recursive case - returns previously calculated model fit if improvement
if((old_r2 > new_r2 | old_r2 > .9) | killSwitch == TRUE) {
# make sure to return numerical index of fit - not the values being compared
peak_width <- c(peakStart = start-3, peakEnd = end+3)
return(peak_width)
} else {
peakwidth_est(peak_vector, time, intensity, start,
end, old_r2 = new_r2)
}
}
##### -----------------========== Bottom of this function kit ======----------------######
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.