#' @title Perform ROI Extraction from raw MS data (PerformDataTrimming)
#' @description This function performs the raw data trimming. This function will output
#' an trimmed MSnExp file to memory or hardisk according to the choice of users must
#' provide the data path for 'datapath', and optionally provide other corresponding parameters.
#' @param datapath Character, the path of the raw MS data files' or folder's path (.mzXML, .CDF and .mzML)
#' for parameters training.
#' @param mode Character, mode for data trimming to select the chraracteristic peaks.
#' Default is 'ssm'. Users could select random trimed according to mz value (mz_random) or
#' RT value (rt_random). Besides, specific peaks at certain mz (mz_specific) or
#' RT (rt_specific) could also be extracted. 'none' will not trim the data.
#' @param mz Numeric, mz value(s) for specific selection. Positive values means including (the values
#' indicted) and negative value means excluding/removing.
#' @param mzdiff Numeric, the deviation (ppm) of mz value(s).
#' @param rt Numeric, rt value for specific selection. Positive values means including
#' and negative value means excluding.
#' @param rtdiff Numeric, the deviation (seconds) of rt value(s).
#' @param rt.idx Numeric, the relative rt (retention time) range, from 0 to 1. 1 means all retention time
#' will be retained, while 0 means none. Default is 1/15. If default rt.idx produce too few peaks,
#' please consider increasing this value.
#' @param write Logical, if true, will write the trimmed data to the directory 'trimmed' folder
#' in the datapath. The data in memory will be kept.
#' @param plot Logical, if TRUE, will plot the chromatogram of the trimmed data.
#' @param rmConts LOgical, whether to exclude/remove the potential contamination for parameters optimization. Default is TRUE.
#' @param running.controller The resuming pipeline running controller. Optional. Don't need to define by hand.
#' @export
#' @import MSnbase
#' @import progress
#' @import Biobase
#' @import RColorBrewer
#' @import tools
#' @return will return an mSet objects with extracted ROI
#' @seealso \code{\link{PerformROIExtraction}} for the new version of this function.
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
#' @examples
#' DataFiles <- dir(system.file("mzData", package = "mtbls2"), full.names = TRUE, recursive = TRUE)
#' # mSet <- PerformDataTrimming(datapath = DataFiles[1],rt.idx = 0.025, rmConts = FALSE);
PerformDataTrimming<- function(datapath, mode="ssm", write=FALSE, mz, mzdiff, rt, rtdiff,
rt.idx=1/15, rmConts = TRUE, plot=TRUE, running.controller=NULL){
PerformROIExtraction(datapath, mode=mode, write, mz, mzdiff, rt, rtdiff,
rt.idx, rmConts = rmConts, plot,running.controller);
}
#' @title Perform ROI Extraction from raw MS data
#' @description This function performs the raw data trimming. This function will output
#' an trimmed MSnExp file to memory or hardisk according to the choice of users must
#' provide the data path for 'datapath', and optionally provide other corresponding parameters.
#' @param datapath Character, the path of the raw MS data files' or folder's path (.mzXML, .CDF and .mzML)
#' for parameters training.
#' @param mode Character, mode for data trimming to select the chraracteristic peaks.
#' Default is 'ssm'. Users could select random trimed according to mz value (mz_random) or
#' RT value (rt_random). Besides, specific peaks at certain mz (mz_specific) or
#' RT (rt_specific) could also be extracted. 'none' will not trim the data.
#' @param mz Numeric, mz value(s) for specific selection. Positive values means including (the values
#' indicted) and negative value means excluding/removing.
#' @param mzdiff Numeric, the deviation (ppm) of mz value(s).
#' @param rt Numeric, rt value for specific selection. Positive values means including
#' and negative value means excluding.
#' @param rtdiff Numeric, the deviation (seconds) of rt value(s).
#' @param rt.idx Numeric, the relative rt (retention time) range, from 0 to 1. 1 means all retention time
#' will be retained, while 0 means none. Default is 1/15. If default rt.idx produce too few peaks,
#' please consider increasing this value.
#' @param write Logical, if true, will write the trimmed data to the directory 'trimmed' folder
#' in the datapath. The data in memory will be kept.
#' @param plot Logical, if TRUE, will plot the chromatogram of the trimmed data.
#' @param rmConts LOgical, whether to exclude/remove the potential contamination for parameters optimization. Default is TRUE.
#' @param running.controller The resuming pipeline running controller. Optional. Don't need to define by hand.
#' @export
#' @import MSnbase
#' @import progress
#' @import Biobase
#' @import RColorBrewer
#' @import tools
#' @return will return an mSet objects with extracted ROI
#' @seealso \code{\link{PerformDataTrimming}} for the old version of this function.
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
#' @examples
#' DataFiles <- dir(system.file("mzData", package = "mtbls2"), full.names = TRUE, recursive = TRUE)
#' # mSet <- PerformROIExtraction(datapath = DataFiles[1],rt.idx = 0.025, rmConts = FALSE);
PerformROIExtraction <-
function(datapath,
mode = "ssm",
write = FALSE,
mz,
mzdiff,
rt,
rtdiff,
rt.idx = 1/15,
rmConts = TRUE,
plot = TRUE,
running.controller = NULL) {
if(!exists(".SwapEnv")){
.SwapEnv <<- new.env(parent = .GlobalEnv);
.SwapEnv$.optimize_switch <- TRUE;
.SwapEnv$count_current_sample <- 0;
.SwapEnv$count_total_sample <- 120; # maximum number for on.public.web
.SwapEnv$envir <- new.env();
}
.SwapEnv$.optimize_switch <- TRUE;
datapath <- normalizePath(datapath);
if(!.on.public.web()){
#Local R package running
files <- Path2Files(path = datapath);
files <- unlist(sapply(files, tools::file_path_as_absolute));
} else {
# code for web version
if (!dir.exists(datapath)) {
datapath <- "upload/";
}
if (!dir.exists(datapath)) {
datapath <- "/home/glassfish/projects/MetaboDemoRawData/upload/QC/";
}
files <- Path2Files(path = datapath);
files <- unlist(sapply(files, tools::file_path_as_absolute));
}
MessageOutput(
"Step 0/6: Scanning ROIs for parameters optimization...",
ecol = "\n",
progress = 1.0
)
match.arg(mode,
choices = c(
"ssm",
"mz_random",
"rt_random",
"mz_specific",
"rt_specific"
))
start.time <- Sys.time();
function.name <- "ROI_extract";
if (is.null(running.controller)) {
c1 <- c2 <- c3 <- c4 <- c5 <- TRUE;
.running.as.plan <- FALSE;
} else {
c1 <- running.controller@ROI_extract[["c1"]]; # Data read part
c2 <- running.controller@ROI_extract[["c2"]]; # Data trim part
c3 <- running.controller@ROI_extract[["c3"]]; # Data write part
c4 <- running.controller@ROI_extract[["c4"]]; # Data plot part
c5 <- running.controller@ROI_extract[["c5"]]; # Data rmConts part
.running.as.plan <- TRUE;
}
if (c1) {
#Select first at least 2 QC data for optimization
if (.on.public.web()) {
#TODO: need to configure with the online pipeline
mSet <- NULL;
load("mSet.rda");
dda_file <- files;
rawfilenms <- basename(mSet@rawfiles);
if (basename(datapath) == "QC") {
QC_uploaded_list <- basename(dda_file);
QC_index <- QC_uploaded_list %in% rawfilenms;
QC_count <- length(which(QC_index));
if (QC_count > 1) {
# deal with the case: 2 or more QC provided/included
if (QC_count > 2) {
dda_file1 <- dda_file[QC_index][seq_len(3)]
} else {
dda_file1 <- dda_file[QC_index]
}
} else if (QC_count == 1) {
# deal with the case: 1 QC provided/included
# List all samples uploaded
all_sample_list <-
list.files(paste0(datapath, "/../"),
recursive = TRUE,
full.names = TRUE)
# choose the included files
included_samples <-
all_sample_list[basename(all_sample_list) %in% rawfilenms]
file_sizes <-
sapply(
included_samples,
FUN = function(x) {
file.size(x)
}
)
largest_file <-
basename(names(sort(file_sizes, decreasing = TRUE)[seq_len(2)]))
sample_index <-
as.numeric(sapply(
unique(c(
basename(dda_file[QC_index]), largest_file
)),
FUN = function(x) {
grep(x, included_samples)
}
))
dda_file1 <- included_samples[sample_index]
} else {
# deal with the case: 0 QC provided/included
# List all samples uploaded
all_sample_list <-
list.files(paste0(datapath, "/../"),
recursive = TRUE,
full.names = TRUE)
# choose the largest files
included_samples <-
all_sample_list[basename(all_sample_list) %in% rawfilenms]
dda_file1 <-
names(sort(
sapply(
included_samples,
FUN = function(x) {
file.size(x)
}
),
decreasing = TRUE
)[seq_len(3)])
}
} else if (basename(datapath) == "upload") {
included_files <- dda_file[basename(dda_file) %in% rawfilenms]
QC_index <- grep("QC_*", included_files)
if (!identical(QC_index, integer(0)) &
length(QC_index) > 1) {
dda_file1 <- included_files[QC_index][seq_len(3)]
} else if (!identical(QC_index, integer(0)) &
length(QC_index) == 1) {
file_sizes <-
sapply(
included_files,
FUN = function(x) {
file.size(x)
}
)
dda_file1 <-
c(included_files[which(file_sizes == max(file_sizes))], included_files[QC_index])
} else {
file_sizes <-
sapply(
included_files,
FUN = function(x) {
file.size(x)
}
)
dda_file1 <-
names(sort(file_sizes, decreasing = TRUE)[seq_len(3)])
}
}
} else {
mSet <- new("mSet");
mSet@rawfiles <- dda_file1 <- files;
}
MessageOutput(paste0(basename(dda_file1), collapse = "\n"),"\n")
pd <-
data.frame(
sample_name = sub(
basename(dda_file1),
pattern = ".mzXML",
replacement = "",
fixed = TRUE
),
stringsAsFactors = FALSE
)
MessageOutput("Data Loading...", "\n", NULL)
raw_data <-
tryCatch(read.MSdata(
dda_file1,
pdata = new("NAnnotatedDataFrame", pd),
msLevel. = 1L,
mode = "inMemory"
), error = function(e) {e})
if (!is(raw_data,"MSnExp")) {
MessageOutput(
mes = paste0(
"<font color=\"red\">",
"\nERROR:",
raw_data$message,
"</font>"
),
ecol = "\n",
progress = 1
)
stop("EXCEPTION POINT CODE: RMS1")
}
MessageOutput("Data Loaded !", "\n", NULL)
if (.running.as.plan) {
cache.save(raw_data, paste0(function.name, "_c1"));
marker_record(paste0(function.name, "_c1"));
}
} else {
mSet <- new("mSet");
mSet@rawfiles <- files;
raw_data <- cache.read(function.name, "c1");
marker_record(paste0(function.name, "_c1"));
}
if (c2) {
if (!mode == "none") {
## Data Trim
# a <-
# suppressMessages(unlist(lapply(
# ls(raw_data@assayData),
# FUN = function(x) {
# unlockBinding(sym = x, env = raw_data@assayData)
# }
# )));
ms_list <-
sapply(
ls(raw_data@assayData),
FUN = function(x)
raw_data@assayData[[x]]@mz
);
MessageOutput("Empty Scan detecting...", "\n", NULL)
emptyScan <- vector()
for (i in seq_along(ms_list)) {
if (identical(ms_list[[i]], numeric(0))) {
emptyScan <- c(emptyScan, i)
}
}
if (!identical(emptyScan, logical(0))) {
MessageOutput(paste0("Empty Scans Found. Removing..."), "\n", NULL);
raw_data <- .emptyscan.remove(raw_data, ms_list);
} else {
MessageOutput(paste0("No Empty scan found !"), "\n", NULL);
}
MessageOutput("Identifying regions of interest (ROI)...", "\n", NULL);
if (missing(rt.idx)) {
rt.idx <- 0.9
# include the 90% RT range
}
if(.on.public.web()){
rmConts <- FALSE;
peakParams <- NULL;
tmp_mes <- try(suppressWarnings(load("params.rda")),silent = TRUE);
} else {
tmp_mes <- 0;
}
if(c5){
if(is(tmp_mes,"try-error") | rmConts){
raw_data <- ContaminatsRemoval(raw_data, ms_list);
raw_data <- .emptyscan.remove(raw_data, ms_list);
} else if (.on.public.web()) {
if(exists("peakParams")){
if(peakParams[["rmConts"]]){
raw_data <- ContaminatsRemoval(raw_data, ms_list);
raw_data <- .emptyscan.remove(raw_data, ms_list);
}
} else {
raw_data <- ContaminatsRemoval(raw_data, ms_list);
raw_data <- .emptyscan.remove(raw_data, ms_list);
}
}
if (.running.as.plan) {
cache.save(raw_data, paste0(function.name, "_c5"));
marker_record(paste0(function.name, "_c5"));
}
} else {
raw_data <- cache.read(function.name, "c5");
marker_record(paste0(function.name, "_c5"));
}
if (mode == "ssm") {
trimed_MSnExp <- ssm_trim(raw_data, ms_list, rt.idx = rt.idx);
}
if (mode == "mz_random") {
try(trimed_MSnExp <- mz.trim_random(raw_data, ms_list), silent = TRUE);
}
if (mode == "rt_random") {
try(trimed_MSnExp <- rt.trim_random(raw_data, ms_list), silent = TRUE)
}
if (mode == "mz_specific") {
trimed_MSnExp <- mz.trim_specific(raw_data, ms_list, mz, mzdiff = mzdiff)
}
if (mode == "rt_specific") {
trimed_MSnExp <- rt.trim_specific(raw_data, ms_list, rt, rtdiff = rtdiff)
}
# remove the empty scans in the ms data
trimed_MSnExp <- .emptyscan.remove(trimed_MSnExp, ms_list);
} else{
# keep the data without trimming.
trimed_MSnExp <- raw_data;
}
if (.running.as.plan) {
cache.save(trimed_MSnExp, paste0(function.name, "_c2"));
marker_record(paste0(function.name, "_c2"));
}
} else {
trimed_MSnExp <- cache.read(function.name, "c2");
marker_record(paste0(function.name, "_c2"));
}
MessageOutput(NULL, NULL, 4);
if (c3) {
if (write == TRUE) {
MessageOutput("Data Writing...",ecol = "\n",NULL);
writenames <-
paste0(datapath,
"/trimmed/Trimmed_",
pd$sample_name,
".mzML",
sep = "")
dir.create(paste0(datapath, "/trimmed", collapse = ""))
suppressMessages(writeMSData(trimed_MSnExp, writenames, outformat = "mzml"))
MessageOutput("Data Writing Finished !",ecol = "\n",NULL);
}
if (.running.as.plan) {
#cache.save(trimed_MSnExp,paste0(function.name,"_c2"));
marker_record(paste0(function.name, "_c3"));
}
}
if (c4) {
if (plot == TRUE) {
MessageOutput("Chromatogram Plotting Begin...",ecol = "\n",NULL);
# if (.on.public.web()) {
# load_RColorBrewer();
# }
ch.xdata <- chromatogram(trimed_MSnExp)
group.col <-
paste0(suppressWarnings(brewer.pal(length(
trimed_MSnExp@processingData@files
), "Blues")));
plot(ch.xdata, col = group.col[seq_along(trimed_MSnExp@processingData@files)])
}
if (.running.as.plan) {
#cache.save(trimed_MSnExp,paste0(function.name,"_c2"));
marker_record(paste0(function.name, "_c4"));
}
}
MessageOutput(paste0("Identification on ROIs Finished!"),
ecol = "\n",
NULL);
if(.running.as.plan){
MessageOutput("Optimization will be started soon...", "\n", NULL);
}
mSet@rawInMemory <- trimed_MSnExp;
if(.on.public.web()){
save(mSet, file = "mSet.rda");
}
return(mSet)
}
#' @title Standards Simulation Method
#' @description Whole mass spectra will be divided as 4 bins according to the mz range. Trimming
#' the raw with slide window method in every bins and retained the windows with highest scan intensity
#' and remove other scan signal in mz dimension. Then the data will be trimed again in the RT dimension
#' with slide window method. The window with highest intensity scans will be kept. After the timming
#' alongside mz and RT dimension, the peaks not only the high intensity peaks, but also the relatively
#' low intensity peaks will also be retained as the 'simulated standards' data for parameters optimization.
#' @param raw_data MSnExp object, the raw data that has been read in memory.
#' @param ms_list List, the names list of all scans.
#' @param rt.idx Numeric, the retention time percentage, from 0 to 1. Default is 1/15.
#' @noRd
#' @import progress
#' @import BiocParallel
#' @import Biobase
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
ssm_trim <- function(raw_data, ms_list, rt.idx){
# mzdata_points<-unique(unlist(ms_list))
# Slide window to choose the high abundance bins in every districts
MessageOutput("MS data Preparing...", "\n", NULL);
#if(length(names(ms_list))>1000){
# ms_list_s<-sort(sample(names(ms_list),1000))
#} else {
# ms_list_s<-sort(names(ms_list))
#}
# Update ms_lsit
empty_toRemove <- which(sapply(names(ms_list),function(x){is.null(raw_data@assayData[[x]])}));
if(length(empty_toRemove)){
ms_list <- ms_list[-empty_toRemove];
}
spectra_mz <- unlist(lapply(MSnbase::spectra(raw_data),mz))
highestmz<-max(spectra_mz)
lowestmz<-min(spectra_mz)
# Split MS into 5 large districts
bins.boundary<-seq(from=lowestmz, to= highestmz,length.out = 5)
if (length(spectra_mz)>1000000){
# set.seed(222);
# rannum<- sort(sample(length(spectra_mz),
# ceiling(length(spectra_mz)/50)),
# decreasing = FALSE)
rannum <- seq(from = 1,
to = length(spectra_mz),
by = 50)
} else if (length(spectra_mz)>100000){
# set.seed(222)
# rannum<-sort(sample(length(spectra_mz),
# ceiling(length(spectra_mz)/5)),
# decreasing = FALSE)
rannum <- seq(from = 1,
to = length(spectra_mz),
by = 5)
} else {
rannum<-seq(length(spectra_mz))
}
spectra_abundance <- unlist(lapply(MSnbase::spectra(raw_data),intensity))[rannum]
spectra_mz <- spectra_mz[rannum]
spectra_mz_set <- sapply(seq_len(4),FUN=function(ii){
spectra_mz[spectra_mz > bins.boundary[ii] & spectra_mz < bins.boundary[ii+1]]
})
spectra_abundance_set <- sapply(seq_len(4),FUN=function(ii){
spectra_abundance[spectra_mz > bins.boundary[ii] & spectra_mz < bins.boundary[ii+1]]
})
rm(spectra_abundance); rm(spectra_mz)
good.bins.list<-bplapply(seq_len(4),
FUN = function(i, spectra_abundance_set, spectra_mz_set, bins.boundary){
binsb.low<-bins.boundary[i];
binsb.high<-bins.boundary[i+1];
w<-1;
bins.width<-(binsb.high-binsb.low)*0.1;
wind.up<-wind.down<-0;
inten.sum.set<-numeric();
while (!wind.up>binsb.high) {
wind.down<-binsb.low+(w-1);
wind.up<-binsb.low+bins.width+(w-1);
#inten.sum<-numeric()
inten.sum.set<-c(inten.sum.set,
sum (spectra_abundance_set[[i]]
[spectra_mz_set[[i]] > wind.down & spectra_mz_set[[i]] < wind.up]));
#for (j in 1:length(ms_list_s)){
# mz.set<-raw_data@assayData[[ms_list_s[j]]]@mz
# a1<-which(sapply(mz.set, FUN=function(x){x>wind.down && x<wind.up}))
# inten.sum<-c(inten.sum,sum(raw_data@assayData[[ms_list_s[j]]]@intensity[a1]))
#}
#inten.sum.set<-c(inten.sum.set,sum(inten.sum))
w<-w+1;
}
selected.bin.down<-binsb.low+which(max(inten.sum.set)==inten.sum.set)-1;
selected.bin.up<-binsb.low+which(max(inten.sum.set)==inten.sum.set)-1+bins.width
return(list(selected.bin.down,selected.bin.up))
},
spectra_abundance_set=spectra_abundance_set,
spectra_mz_set=spectra_mz_set,
bins.boundary=bins.boundary,
BPPARAM = MulticoreParam(4L))
MessageOutput("MS Data ready !", "\n", NULL);
# Remove the peaks out of the bins
MessageOutput("Identifying ROIs in m/z dimensions...", "\n", NULL);
pb <- progress_bar$new(format = "ROIs Identification in MZ Dimension [:bar] :percent Time left: :eta", total = length(ms_list), clear = TRUE, width= 80)
for (i in seq_along(ms_list)){
pb$tick();
ms.set<-raw_data@assayData[[names(ms_list)[i]]]@mz
k<-which(sapply(ms.set,FUN = function(x){(x > good.bins.list[[1]][[1]] && x < good.bins.list[[1]][[2]]) |
(x > good.bins.list[[2]][[1]] && x < good.bins.list[[2]][[2]]) |
(x > good.bins.list[[3]][[1]] && x < good.bins.list[[3]][[2]]) |
(x > good.bins.list[[4]][[1]] && x < good.bins.list[[4]][[2]])}))
raw_data@assayData[[names(ms_list)[i]]]@mz<-raw_data@assayData[[names(ms_list)[i]]]@mz[k];
raw_data@assayData[[names(ms_list)[i]]]@intensity<-raw_data@assayData[[names(ms_list)[i]]]@intensity[k];
raw_data@assayData[[names(ms_list)[i]]]@tic<-sum(raw_data@assayData[[names(ms_list)[i]]]@intensity);
raw_data@assayData[[names(ms_list)[i]]]@peaksCount<-length(raw_data@assayData[[names(ms_list)[i]]]@mz)
}
MessageOutput("Identifying ROIs in m/z dimensions Done !", "\n", NULL);
## Trimed data outside the RT bins
rt_set<-sapply(names(ms_list),FUN=function(x) raw_data@assayData[[x]]@rt)
rt.bin.width<-(max(rt_set)-min(rt_set))*rt.idx
# Slide window to determin the location of the RT trim boundary
tic<-1;w<-0;rt.window.min<-min(rt_set);tic.sum<-numeric()
while (!rt.window.min>max(rt_set)) {
rt.window.min<-min(rt_set)+w*0.75
rt.window.max<-min(rt_set)+rt.bin.width+w*0.75
rt.name.set<-names(which(sapply(rt_set,FUN = function(x){x>rt.window.min && x<rt.window.max})))
if(!identical(rt.name.set,character(0))){
tic.sum<-c(tic.sum,sum(sapply(rt.name.set,FUN=function(x){raw_data@assayData[[x]]@tic})))
}
w<-w+1
}
rt.boundary.lowlimit<-min(rt_set)+which(max(tic.sum)==tic.sum)[ceiling(length(which(max(tic.sum)==tic.sum))/2)]*0.75
rt.boundary.uplimit<-min(rt_set)+which(max(tic.sum)==tic.sum)[ceiling(length(which(max(tic.sum)==tic.sum))/2)]*0.75+rt.bin.width
MessageOutput("Identifying ROIs in RT dimensions...", "\n", NULL);
pb <- progress_bar$new(format = "ROIs Identification in RT Dimension [:bar] :percent Time left: :eta", total = length(ms_list), clear = TRUE, width= 80)
ncount<-numeric();
for (w in seq_along(ms_list)){
pb$tick();
if (raw_data@assayData[[names(ms_list)[w]]]@rt>rt.boundary.lowlimit && raw_data@assayData[[names(ms_list)[w]]]@rt<rt.boundary.uplimit){
ncount<-c(ncount,w)
}
}
for (j in seq_along(ms_list)){
if (!(j %in% ncount)){
raw_data@assayData[[names(ms_list)[j]]]@mz<-raw_data@assayData[[names(ms_list)[j]]]@intensity<-as.double();
raw_data@assayData[[names(ms_list)[j]]]@peaksCount<-as.integer(0);
raw_data@assayData[[names(ms_list)[j]]]@tic<-as.double(0);
}
}
return(raw_data)
}
#' @title Data trimming Method Based on Random MS
#' @description Trim raw data scan signal randomly in the mz dimension.
#' @param raw_data MSnExp object, the raw data that has been read in memory.
#' @param ms_list List, the names list of all scans.
#' @noRd
#' @import progress
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
mz.trim_random <- function(raw_data, ms_list){
mzdata_points <- unique(unlist(ms_list))
highestmz <- max(mzdata_points)
lowestmz <- min(mzdata_points)
# Randomly select 100 mz center points
#set.seed(1233);
mzdata_points <- sample(unique(mzdata_points),size = 100)
pb <- progress_bar$new(format = "Data Trimming [:bar] :percent Time left: :eta", total = length(raw_data@assayData), clear = TRUE, width= 60)
for (i in seq_along(raw_data@assayData)){
pb$tick();
mz.data<-ms_list[[i]];
remove.num.set<-numeric();k<-1
for (j in seq_along(mz.data)){
if (!(TRUE %in% (abs(mz.data[j]-mzdata_points)<=(highestmz-lowestmz)/10000))){
remove.num.set[k]<-j;
k<-k+1
}
}
raw_data@assayData[[names(ms_list[i])]]@mz<-mz.data[-remove.num.set];
raw_data@assayData[[names(ms_list[i])]]@intensity<-raw_data@assayData[[names(ms_list[i])]]@intensity[-remove.num.set];
raw_data@assayData[[names(ms_list[i])]]@tic<-sum(raw_data@assayData[[names(ms_list[i])]]@intensity);
raw_data@assayData[[names(ms_list[i])]]@peaksCount<-length(raw_data@assayData[[names(ms_list[i])]]@mz);
}
return(raw_data)
}
#' @title Data trimming Method Based on Random RT
#' @description Trim raw data scan signal randomly in the RT dimension.
#' @param raw_data MSnExp object, the raw data that has been read in memory.
#' @param ms_list List, the names list of all scans.
#' @noRd
#' @import progress
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
rt.trim_random <- function(raw_data, ms_list){
rt_set<-sapply(names(ms_list),FUN=function(x) raw_data@assayData[[x]]@rt)
rt_begin<-min(rt_set);
rt_end<-max(rt_set);
xn2<-character();xn3<-xn<-xcount<-ncount<-rt_var_sum<-numeric();
rt_width_aver<-(rt_end-rt_begin)/200
xn_list<-list()
pb <- progress_bar$new(format = "Data Trimming [:bar] :percent Time left: :eta", total = 200, clear = TRUE, width= 60)
for (w in seq_len(200)){
pb$tick();
rt_var<-rt_begin+rt_width_aver*(w-0.5);
rt_var_sum<-c(rt_var_sum,rt_var);
xn_list[w][[1]]<-0
}
xn<-sapply(names(ms_list),FUN= function(x) which(abs(raw_data@assayData[[x]]@rt-rt_var_sum)<=(rt_width_aver/2+0.000001)))
for (i in seq_along(xn)){xn_list[[xn[i]]]<-sum(xn_list[[xn[i]]],raw_data@assayData[[names(xn[i])]]@peaksCount)}
rt_peakcounts<-unlist(xn_list)
#rtslots<-rt_var_sum[sapply(tail(sort(unlist(xn_list)),10),FUN = function(x){which(x==rt_peakcounts)})]
# randomly select 10 RT slots
#set.seed(1244)
rtslots<-rt_var_sum[sapply(sample(unlist(xn_list),10),
FUN = function(x){which(x==rt_peakcounts)})]
for (j in seq_along(ms_list)){
if (TRUE %in% (abs(raw_data@assayData[[names(ms_list)[j]]]@rt-rtslots)<=(rt_width_aver/2+0.000001))){
xn2<-c(xn2,names(ms_list)[j]);xn3<-c(xn3,j)
}
}
for (k in seq_along(ms_list)) {
if (!(k %in% xn3)){
raw_data@assayData[[names(ms_list)[k]]]@peaksCount<-as.integer(0);
raw_data@assayData[[names(ms_list)[k]]]@mz<-as.double();
raw_data@assayData[[names(ms_list)[k]]]@intensity<-as.double();
raw_data@assayData[[names(ms_list)[k]]]@tic<-as.double(0);
}
}
return(raw_data)
}
#' @title Data trimming Method Based on Specific MS
#' @description Trim data based on specific mz values. Positive values will be specially retained,
#' while the negative values will be removed.
#' @param raw_data MSnExp object, the raw data that has been read in memory.
#' @param ms_list List, the names list of all scans.
#' @param mz Numeric, the specifric mz value that will be kept or removed.
#' @param mzdiff Numeric, the deviation (ppm) for the 'mz' values. Default is 100.
#' @noRd
#' @import progress
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
mz.trim_specific<-function(raw_data, ms_list, mz, mzdiff=100){
#mz<-c(72.323,100,120,240,360,480,720,780.524,1080);
if (missing(mz)){
stop("'mz' must be provided for mz_specific mode as a numeric vector!")
}
if (missing(mzdiff)){
stop("'mzdiff' must be provided for mz_specific mode as a numeric value!")
}
if(length(mz) == 0){
return(raw_data)
}
if (mz[1] == 0 | mzdiff==0){
stop("'mz' or 'mzdiff' cannot be zero!")
}
mz.neg<-mz[which(mz<0)];
mz.pos<-mz[which(mz>0)];
if (!identical(mz.pos,numeric(0))){
pb <- progress_bar$new(format = "Data Trimming_keeping [:bar] :percent Time left: :eta", total = length(ms_list), clear = TRUE, width= 60)
for (w in seq_along(ms_list)){
pb$tick();
if(is.null(raw_data@assayData[[names(ms_list)[w]]])){next}
xn<-unlist(sapply(mz.pos,FUN=function(x){which((abs(x-raw_data@assayData[[names(ms_list)[w]]]@mz)/(x)*1000000)<=mzdiff)}));
raw_data@assayData[[names(ms_list)[w]]]@mz<-raw_data@assayData[[names(ms_list)[w]]]@mz[xn];
raw_data@assayData[[names(ms_list)[w]]]@intensity<-raw_data@assayData[[names(ms_list)[w]]]@intensity[xn];
raw_data@assayData[[names(ms_list)[w]]]@peaksCount<-length(raw_data@assayData[[names(ms_list)[w]]]@mz);
raw_data@assayData[[names(ms_list)[w]]]@tic<-sum(raw_data@assayData[[names(ms_list)[w]]]@intensity);
}
for (w in seq_along(ms_list)){
if (identical(raw_data@assayData[[names(ms_list)[w]]]@mz,numeric(0))){
raw_data@assayData[[names(ms_list)[w]]]@mz<-as.double();
raw_data@assayData[[names(ms_list)[w]]]@intensity<-as.double();
}
}
}
if (!identical(mz.neg,numeric(0))){
pb <- progress_bar$new(format = "Data Trimming_removing [:bar] :percent Time left: :eta", total = length(ms_list), clear = TRUE, width= 60)
for (w in seq_along(ms_list)){
pb$tick();
if(is.null(raw_data@assayData[[names(ms_list)[w]]])){next}
xn<-unlist(sapply(abs(mz.neg),FUN=function(x){which((abs(x-raw_data@assayData[[names(ms_list)[w]]]@mz)/(x)*1000000)<=mzdiff)}));
if (!identical(xn, numeric(0))){
raw_data@assayData[[names(ms_list)[w]]]@mz<-raw_data@assayData[[names(ms_list)[w]]]@mz[-xn];
raw_data@assayData[[names(ms_list)[w]]]@intensity<-raw_data@assayData[[names(ms_list)[w]]]@intensity[-xn];
raw_data@assayData[[names(ms_list)[w]]]@peaksCount<-length(raw_data@assayData[[names(ms_list)[w]]]@mz);
raw_data@assayData[[names(ms_list)[w]]]@tic<-sum(raw_data@assayData[[names(ms_list)[w]]]@intensity);
}
}
for (w in seq_along(ms_list)){
if(is.null(raw_data@assayData[[names(ms_list)[w]]])){next}
if (identical(raw_data@assayData[[names(ms_list)[w]]]@mz,numeric(0))){
raw_data@assayData[[names(ms_list)[w]]]@mz<-as.double();
raw_data@assayData[[names(ms_list)[w]]]@intensity<-as.double();
}
}
}
return(raw_data)
}
#' @title Data trimming Method Based on Specific RT
#' @description Trim data based on specific RT values. Positive values will be specially retained,
#' while the negative values will be removed.
#' @param raw_data MSnExp object, the raw data that has been read in memory.
#' @param ms_list List, the names list of all scans.
#' @param rt Numeric, the specifric RT value that will be kept or removed.
#' @param rtdiff Numeric, the deviation (ppm) for the 'rt' values. Default is 100.
#' @noRd
#' @import progress
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
rt.trim_specific<-function(raw_data,ms_list,rt,rtdiff=10){
if (missing(rt)){
stop("'rt' must be provided for rt_specific mode with a vector !")
}
if (missing(rtdiff)){
stop("'rtdiff' must be provided for rt_specific mode with a numeric !")
}
if (rt[1] == 0 | rtdiff==0){
stop("'rt' or 'rtdiff' can not be zero !")
}
if (rt[1] > 0){
pb <- progress_bar$new(format = "Data Trimming [:bar] :percent Time left: :eta", total = length(ms_list), clear = TRUE, width= 60)
ncount<-numeric();
for (w in seq_along(ms_list)){
pb$tick();
if (TRUE %in% (abs(raw_data@assayData[[names(ms_list)[w]]]@rt-rt)<=rtdiff)){
ncount<-c(ncount,w)
}
}
for (j in seq_along(ms_list)){
if (!(j %in% ncount)){
raw_data@assayData[[names(ms_list)[j]]]@mz<-raw_data@assayData[[names(ms_list)[j]]]@intensity<-as.double();
raw_data@assayData[[names(ms_list)[j]]]@peaksCount<-as.integer(0);
raw_data@assayData[[names(ms_list)[j]]]@tic<-as.double(0);
}
}
} else {
pb <- progress_bar$new(format = "Data Trimming [:bar] :percent Time left: :eta", total = length(ms_list), clear = TRUE, width= 60)
ncount<-numeric();
for (w in seq_along(ms_list)){
pb$tick();
if (TRUE %in% (abs(raw_data@assayData[[names(ms_list)[w]]]@rt-abs(rt))<=rtdiff)){
ncount<-c(ncount,w)
}
}
for (j in seq_along(ms_list)){
if (j %in% ncount){
raw_data@assayData[[names(ms_list)[j]]]@mz<-raw_data@assayData[[names(ms_list)[j]]]@intensity<-as.double();
raw_data@assayData[[names(ms_list)[j]]]@peaksCount<-as.integer(0);
raw_data@assayData[[names(ms_list)[j]]]@tic<-as.double(0);
}
}
}
return(raw_data)
}
#' @title Function for 'Empty scan' removal
#' @description Function for 'Empty scan' removal (internal use only)
#' @noRd
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
.emptyscan.remove<-function(raw_data, ms_list){
name.set<-sort(names(raw_data@assayData))
sample.set<-unique(sapply(seq_along(name.set),FUN=function(x){sub(".[S][0-9]+","",name.set[x])}))
assayData<-list();assayData1<-new.env();ki<-k<-1
df.data0<-raw_data@featureData@data;
df.data<-data.frame();
if(length(sample.set) > 9){
nchar.num<-nchar(names(ms_list)[1])-5
} else if (length(sample.set) > 99) {
nchar.num<-nchar(names(ms_list)[1])-6
} else {
nchar.num<-nchar(names(ms_list)[1])-4
}
suppressWarnings(for (i in seq_along(raw_data@assayData)){
if (!raw_data@assayData[[name.set[i]]]@tic==0){
if (!k==1){
if (!sample.set[which(sub(".[S][0-9]+","",name.set[i])==sample.set)]==sub(".[S][0-9]+","",names(assayData)[ki-1])){
k<-1
}
}
name0<-paste0(sample.set[which(sub(".[S][0-9]+","",name.set[i])==sample.set)],
".S",formatC(k, width=nchar.num, digits = nchar.num, flag="0"))
assayData[name0]<-raw_data@assayData[[name.set[i]]];
assayData[[name0]]@acquisitionNum<-as.integer(k);
assayData[[name0]]@scanIndex<-as.integer(k);
df.data[ki,1]<-k;
row.names(df.data)[ki]<-name0;
k<-k+1;ki<-ki+1;
}
})
list2env(assayData,envir = assayData1)
raw_data@assayData<-assayData1
names(df.data)<-"spectrum"
metaData<-raw_data@featureData@varMetadata;
featureData<-AnnotatedDataFrame(data=df.data, varMetadata=metaData);
raw_data@featureData<-featureData;
return(raw_data)
}
#' @title Function MS Generation
#' @description Output the MS data. This function will generate .mzML MS data in the working dirctory.
#' @param raw_data MS data in R environment with "MSnExp" class.
#' @noRd
#' @import MSnbase
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} Jeff Xia \email{jeff.xia@mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
PerformMSDataOutput<-function(raw_data){
message("Data Writing...")
writenames<-paste0("new_",raw_data@phenoData@data[["sample_name"]],sep = "")
#dir.create(paste0(datapath,"/trimed",collapse = ""))
suppressMessages(writeMSData(raw_data, writenames, outformat = "mzML"))
message("Output Data:","\n");
message(writenames)
message(c("Depositing Folder: ",getwd()))
message("Data Writing Finished !")
}
#' @title Function for 3D ms plotting
#' @description Function for 3D ms plotting (internal use only)
#' @importFrom lattice cloud
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' @noRd
plot.MS_3D<-function(object) {
# if (.on.public.web()){load_lattice()};
#
dd <- as(object, "data.frame")
dd$rt <- dd$rt*60;
ms <- NULL ## get rid of 'no visible global function definition' note
par.set <- list(box.3d = list(lwd=.2))
cloud(intensity ~ mz + rt , data = dd,
type="h",
scales= list(
arrows=FALSE,
cex=.65,
draw=TRUE),
aspect=c(.8, 1),
group = ms,
zoom = 1,
par.settings = par.set,
axis.line = list(col = "transparent"),
xlab="m/z", ylab="Retention time (sec)", zlab=NULL)
}
#' @noRd
ContaminatsRemoval <- function(raw_data, ms_list){
scan_names <- sort(names(raw_data@assayData));
fre_stats <- plyr::count(round(unname(unlist(ms_list)), 2))
topScan_stats <- fre_stats[order(fre_stats$freq, decreasing = TRUE),][seq_len(300),] # get top 300 most frequent mz value
scan_totalLength <- length(raw_data@assayData)
top_mzs <- topScan_stats$x
topScan_stats[,4] <- topScan_stats[,3] <- 0;
colnames(topScan_stats)[c(3,4)] <- c("RT_ratio", "mz_mean");
MessageOutput("Identifying regions of potential contaminants", "", NULL);
k_count <- 0;
for(i in seq_len(80)){
mzrange <- c(top_mzs[i]-0.005, top_mzs[i]+0.005);
RawData <- suppressWarnings(MSnbase::filterMz(raw_data, mzrange));
scan_count <- 0;
mz_vec <- vector();
if ((i - k_count) > 0 ){
MessageOutput(".", "", NULL);
k_count <- k_count +10;
}
for(j in scan_names){
if(!identical(RawData@assayData[[j]]@mz, numeric(0))){
scan_count <- scan_count +1;
mz_vec <- c(mz_vec, RawData@assayData[[j]]@mz)
}
}
topScan_stats[i,3] <- scan_count/scan_totalLength;
topScan_stats[i,4] <- mean(mz_vec);
if(topScan_stats[i,3] < 0.5){
break;
}
}
MessageOutput("Done!", "\n", NULL)
#save(topScan_stats, file = "topScan_stats.rda")
mzs_toRemove <- topScan_stats[topScan_stats[,3] > 0.5, 4]
raw_data_clean <- mz.trim_specific(raw_data, ms_list, -mzs_toRemove, mzdiff=10)
MessageOutput(paste0(length(mzs_toRemove), " potential contaminamts will not be used for parameters optimization !\nGoing to the next step..."),
"\n",
NULL)
raw_data_clean <- .emptyscan.remove(raw_data_clean, ms_list);
return(raw_data_clean);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.