R/GC-MS functions.R

Defines functions erah_pipeline .update_feature_definitions .peakIndex .getPeakGroupsRtMatrix adjustRtimePeakGroups testpar Par_peakintegrate_slow Par_peakintegrate RT_correction_xcms RT_correction_LORESS RT_correction multimodel PeakDiagnosis MassHunter_id_Summary mgf_id_Summary amdis_id_Summary MZrange_autodetect GCMS_integration

Documented in amdis_id_Summary GCMS_integration MassHunter_id_Summary MZrange_autodetect PeakDiagnosis

#' GCMS_integration
#'
#' This is a function for peak integration according to the IDs in identification summary
#'
#' @param output specify the output file name initial 
#' @param Ion.bin the mz tolerance (Da) for peak integration
#' @param workdir locate the working Dir
#' @param intensity_type the type of signal to be reported \code{"Peak Height"} or \code{"Peak Area"}
#' @param GGReport the reporting mode \code{"Slow"} plot every integrated peak, or \code{"Fast"} generate an overall report 
#'
#' @return None
#'
#' @examples
#' GCMS_integration()
#'
#' @export


GCMS_integration<-function(output = "GC-MS Result",
                           Ion.bin= 0.5,
                           workdir=getwd(),
                           intensity_type=c("Peak Area","Peak Height"),
                           GGReport=c("Fast","Slow")
) {
  require(flux)
  require(parallel)
  require(xcms)
  require(lattice)
  require(plyr)
  require(tcltk)
  require(mzR)
  require(Rcpp)
  require(data.table)
  setwd(workdir)
  final.check.data <- read.csv(paste0(tcltk::tk_choose.files(caption = "Load a Summary Report.csv",  multi = FALSE),collapse = " "),stringsAsFactors=FALSE)
  
  final.check.data <- final.check.data[final.check.data$Total.ID!=0,]
  check.Ref.ion <-table(is.na(final.check.data$Ref.ion))["TRUE"]
  check.Ref.ion[is.na(check.Ref.ion)]<-0
  if(check.Ref.ion>=1){tkmessageBox(message = "ERROR: Missing reference ions in the Identification Report!", icon = "warning", type = "ok"); stop("Missing reference ion")}
  
  ion.lib<-final.check.data
  files <- c(dir())
  info <- file.info(files)
  isDir <- info$isdir
  conditions_pre1 <- c(files[isDir == TRUE])
  
  conditions_pre<-c(conditions_pre1 ,"Manually choose folder")
  
  conditions <- tk_select.list(conditions_pre, multiple = FALSE,
                               title = paste("Folder contains all cdf files"))
  
  if (conditions=="Manually choose folder"){ conditions<-tk_choose.dir(default = "", caption = "A folder contains all cdf files")}
  
  ###
  
  library_file_org=ion.lib[,c("Name","Ref.ion", "RT.median", "RT.shfit.Lower", "RT.shfit.upper","Peak.width")]
  library_file_org<-library_file_org[!is.na(library_file_org$RT.median),]
  final.df<-NULL
  remove(final.df)
  Graphic.df<-data.frame()
  filenames <- dir(conditions)
  filenames <- filenames[grep(c(".cdf$|.mzXML$"),filenames, ignore.case=TRUE)]
  thread <- round(nrow(library_file_org)/200,digits = 0)
  if (thread>=6) {thread=6}
  if (thread<1) {thread=1}
  
  for (q in 1:length(filenames)) {
    findScanTime=NULL
    res<-try(findScanTime <- xcmsRaw(filename = paste(conditions, filenames[q], sep = "/"),))
    if(class(res) == "try-error"){
     #scanextract=stringr::str_extract_all(res[1],"scan.....")[[1]]
     #scanextract=gsub("scan","",scanextract)
     #scanextract=gsub(" ","",scanextract)
     #scanextract=as.numeric(scanextract)
     
     filepath <- xcmsSource(paste(conditions, filenames[q], sep = "/"))
     rawdata <- loadRaw(filepath, includeMSn = F)
     mzrange_file<-range(rawdata$mz,na.rm = T)
     scanrange_file<-max(which((min(which(is.na(rawdata$mz)))>=rawdata$scanindex)))-1
     res<-try(findScanTime <- xcmsRaw(filename = paste(conditions, filenames[q], sep = "/"),scanrange = c(1,scanrange_file)))
     #res<-try(findScanTime <- xcmsRaw(filename = paste(conditions, filenames[q], sep = "/"),scanrange = c(1,length(rawdata$scanindex))))
     #res<-try(findScanTime <- xcmsRaw(filename = paste(conditions, filenames[q], sep = "/"),scanrange = 1:10))
    } 
    
    
    if(class(res) != "try-error"){
    ScanPerSec <- round(length(findScanTime@ scantime)/(findScanTime@ scantime[length(findScanTime@ scantime)]-findScanTime@ scantime[1]),2)
    scanrangeL <- min(findScanTime@scantime)
    scanrangeU <- max(findScanTime@scantime)
    message(paste("Scan rate:", ScanPerSec,"scan/second"))
    
    library_file <- library_file_org
    library_file$RT.shfit.Lower[is.na(library_file_org$RT.shfit.Lower)]<-0 ## testing
    library_file$RT.shfit.Lower<-library_file_org$RT.shfit.Lower + library_file_org$Peak.width/ScanPerSec
    
    library_file$RT.shfit.upper[is.na(library_file_org$RT.shfit.upper)]<-0 ## testing
    library_file$RT.shfit.upper<-library_file_org$RT.shfit.upper +library_file_org$Peak.width/ScanPerSec
    library_file$Ref.ion=as.numeric(as.character(library_file$Ref.ion))
    library_file<-library_file[!is.na(library_file$RT.median),]
    
    file1 <- filenames[q]
    name.file <- file1
    name.file <- gsub(".CDF", "", name.file, ignore.case=TRUE)
    surefinal <- data.frame(Name=ion.lib[,"Name"],Base.Peak=0)
    raw_data <- findScanTime
    
    if (GGReport=="Slow"){
    
      
        integrationresult=parLapply(cl=autoStopCluster(makeCluster(thread)),
                                             1:nrow(library_file),
                                             Par_peakintegrate_slow,
                                             library_file,
                                             findScanTime,
                                             raw_data,
                                             intensity_type,
                                             Ion.bin,
                                             name.file)
        
        
        Graphic.dflist<-list()
        for (h in 1:nrow(library_file)){
          surefinal$Base.Peak[h]<-integrationresult[[h]]["Peakvalue"]
          Graphic.dflist[[h]]<-as.data.frame(integrationresult[[h]]["Graphic.df"])
          #message(h)
        }
        surefinal$Base.Peak=unlist(surefinal$Base.Peak)
        Graphic.df<-rbind(Graphic.df,data.table::rbindlist(Graphic.dflist))
        #Graphic.df<-dplyr::bind_rows(Graphic.dflist)
  
    }
    
    
    if (GGReport=="Fast"){
    
    
    surefinal$Base.Peak=unlist(parLapply(cl=autoStopCluster(makeCluster(thread)),
                                         1:nrow(library_file),
                                         Par_peakintegrate,
                                         library_file,
                                         findScanTime,
                                         raw_data,
                                         intensity_type,
                                         Ion.bin))
    }
    
    
    
    surefinal <- surefinal[!duplicated(surefinal$Name),]
    names(surefinal)[2] <- name.file
    
    if (!exists("final.df")){
      final.df <- as.data.frame(surefinal)
      colnames(final.df)=c("Name",filenames[q])
      confirmation <- paste("File", q, "(", name.file,")", "done!", sep = " ")
      message(confirmation)
    } else {
      colnames=colnames(final.df)
      final.df <- cbind(final.df,surefinal[-1])
      
      colnames(final.df)=c(colnames,filenames[q])
      confirmation <- paste("File", q, "(", name.file,
                            ")", "done!", sep = " ")
      message(confirmation)
    }
    
    write.csv(final.df,file = "Integration report tempfile.csv",row.names = F)
    
    }
  }
  
  message("Estimating errors")
  
  
  
  ### setup warnign message
  
  if(class(ion.lib$CAS)!="NULL"){
    finalX.df <- cbind(CAS=ion.lib$CAS, Ref.Ion=ion.lib$Ref.ion, Total.ID=ion.lib$Total.ID, Library.match=ion.lib$Library.match, Ret.Time=ion.lib$RT.median, RT.shfit.Lower=ion.lib$RT.shfit.Lower, Warnings=NA, final.df)
    
    finalX.df <- finalX.df[order(finalX.df$Ret.Time , decreasing = FALSE),]
    parameterLlibraryMatch <- 40
    parameterRTShif <- 60
    parameterSimilarRT <- 0.01
    parameterN <-0.05 # Percentage
    
    # warning matrix
    warning.df <- data.frame(matrix(vector(), length(finalX.df[,1]), 6, dimnames=list(c(), c("LowID", "Library.match.d", "RT.shift.d", "Repeated.aboudnace", "Repeated.RT", "Close.RT"))), stringsAsFactors=F)
    
    # filter low match filter
    low.match<- which(finalX.df$Library.match <  parameterLlibraryMatch)
    warning.df[row.names(warning.df) %in% low.match,"Library.match.d"]<-"Library.Match<40%/"
    
    
    # Filter low RT shift
    high.RT.shift <- which(finalX.df$RT.shfit.Lower >  parameterRTShif )
    warning.df[row.names(warning.df) %in%  high.RT.shift,"RT.shift.d"]<-"RT.shift>60s/"
    
    # Filter low Number of ID
    
    low.nID <- which(finalX.df$Total.ID  <  parameterN*length(filenames))
    
    warning.df[row.names(warning.df) %in%   low.nID,"LowID"]<-"Total.ID<5%/"
    low.nID <- which(finalX.df$Total.ID  ==1 )
    warning.df[row.names(warning.df) %in%   low.nID,"LowID"]<-"Total.ID=1/"
    
    total.n<-length(filenames)
    ID_name<-paste("Total.ID(n=",total.n,")",sep="")
    names(finalX.df)[3]<-ID_name
    
    
    
    # Remove the repeat aboundace
    last.postion <- 8 + length(filenames)
    dulpicated_aboundace<-unique(finalX.df[duplicated(finalX.df[, last.postion]), last.postion])
    Repeate.aboundance<-NULL
    if (length(dulpicated_aboundace)!=0){
      for(n in 1: length(dulpicated_aboundace)){
        Repeat.All<- finalX.df[finalX.df[,last.postion]==dulpicated_aboundace[n],]
        Repeate.aboundance <-rbind(Repeate.aboundance, Repeat.All[-which.max(Repeat.All$Library.match),])
      }
      warning.df[row.names(warning.df) %in% row.names(Repeate.aboundance),"Repeated.aboudnace"]<-"Repeated.Level/"
    }
    
    # Remove the repated RT
    dulpicated_RT<-unique(finalX.df$Ret.Time[duplicated(finalX.df$Ret.Time)])
    
    Repeate.RT<-NULL
    if (length(dulpicated_RT)!=0){
      for(n in 1: length(dulpicated_RT)){
        Repeat.All<- finalX.df[finalX.df$Ret.Time==dulpicated_RT[n],]
        Repeate.RT <-rbind(Repeate.RT, Repeat.All[-which.max(Repeat.All$Library.match),])
      }
      warning.df[row.names(warning.df) %in% row.names(Repeate.RT),"Repeated.RT"]<-"Repeated.RT/"
    }
    
    # Identfiy similar RT
    close.RT<-NULL
    for(i in 1:(length(finalX.df[,1])-1)){
      n=i+1
      diff <-finalX.df$Ret.Time[n] - finalX.df$Ret.Time[i]
      if(diff <=  parameterSimilarRT & diff!=0 ){
        close.df <- finalX.df[i:n,]
        close.RT <- rbind(close.RT,  close.df[-which.max(close.df$Library.match),])
        warning.df[row.names(warning.df) %in% row.names(close.RT),"Close.RT"]<-"Similar.RT/"
      }
    }
    
    warning.list <- paste(warning.df[,1], warning.df[,2], warning.df[,3], warning.df[,4], warning.df[,5], warning.df[,6], sep = ' ')
    warning.list <- gsub("NA","", warning.list)
    warningZ.list <- gsub("  ","", warning.list)
    warningY.list <- gsub(" ","", warning.list)
    warningX.list <- gsub(".$", "",warningY.list)
    finalX.df$Warnings <-warningX.list
    finalF.df <- finalX.df
    finalZ.df<-finalX.df
    finalZ.df$Warnings <-warningZ.list
    
    erroreport<- finalZ.df
    finalF_errorplot <-erroreport[order(erroreport$Name , decreasing = FALSE),]
    greystrip<-gsub(" ", "grey95",finalF_errorplot$Warnings)
    greystrip[greystrip!="grey95"]<-"firebrick1"
    redwarning<<-greystrip
    
  } else {finalF.df=cbind( Ref.Ion=ion.lib$Ref.ion, Ret.Time=ion.lib$RT.mean,RT.shift=ion.lib$RT.shift, final.df)
  finalF.df <- finalF.df[order(finalF.df$Ret.Time , decreasing = FALSE),]
  }
  
  ## Save Files
  message("Save file")
  if (GGReport=="Slow"){
    colnames(Graphic.df)=stringr::str_replace(colnames(Graphic.df),"Graphic.df.","")
    write.csv(Graphic.df,file="GC-Peak Diagnosis Report.csv")
    }
  ifelse(intensity_type=="Peak Height",ValueType<-"PeakHeight",ValueType<-"PeakArea")
  sheet <- paste(output,"(", ValueType ,")", sep="")
  store <- paste(workdir, "\\", sheet, ".csv", sep = "")
  FileName <- tclvalue(tkgetSaveFile(initialfile=store))
  write.csv(finalF.df, file = FileName, row.names = FALSE)
  
  message(paste("Data process is completed and saved as: ",sheet,".csv", sep=""))
}

#' mz range detection
#'
#' This is a function that help user to determine the sharing mz range acrossing data files. 
#'
#' @param QCfiles specify the data file(s) to find the mz range
#' 
#' @return a vector contains mz lower/upper limit 
#'
#' @examples
#' MZrange_autodetect()
#'
#' @export


MZrange_autodetect<-function(){
  mz_L=0
  mz_U=1000
  require(xcms)
  require(lattice)
  require(plyr)
  require(tcltk)
  require(mzR)
  require(Rcpp)
  QCfiles = tk_choose.files(multi = T, caption = "Select a data file to detect the mz range")
  matchtype=c("mzxml$","cdf$")
  #datafiles=gsub(paste0(dirname(datafiles)[1],"/"),"",datafiles)
  
  QCfiles=QCfiles[grep(pattern =paste(matchtype,collapse="|"),x=QCfiles,ignore.case = T)]
  openCDFpar<-function(h,QCfiles){
    require(xcms)
    try(findScanTime <- xcmsRaw(filename = QCfiles[h])) 
    findScanTime@mzrange
    
  }
  
  #mzranges<-parLapply(cl=autoStopCluster(makeCluster(detectCores())),1:length(QCfiles),openCDFpar,QCfiles)
  for (QCfile in QCfiles){
    findScanTime <- xcmsRaw(filename = QCfile)
    if (mz_L<=min(findScanTime@mzrange)) mz_L=min(findScanTime@mzrange)
    if (mz_U>=max(findScanTime@mzrange)) mz_U=max(findScanTime@mzrange)  
  }
  
  
  
  #mz_L <- tclVar(mz_L)
  #mz_U <- tclVar(mz_U)
  #tkconfigure(MZrange_L,variable=mz_L)
  #tkconfigure(MZrange_U,variable=mz_U)
  #MZrange_L <- tkentry(TMS, textvariable=mz_L ,width=6)
  #MZrange_U <- tkentry(TMS, textvariable=mz_U ,width=6)
  #tkgrid(tklabel(frameOverall,text="    "))
  #try(tclSetValue("mz_L", mz_L))
  #try(tclSetValue("mz_U", mz_U))
  return(c(mz_L,mz_U))
}


#' AMDIS report file parsing and summary
#'
#' This is a function to process the AMDIS_report.txt file. specify a in-house library or a subset of NIST library to find the most intense quantification ion within the GC-MS experiment m/z range. The script will invoke selection windows to let the user specify the files location.  
#'
#' @param workdir locate the working Dir
#' @param MS.L specify the library file
#' @param MsLibrary specify the library file origin, could be "NIST" or "InHouse"
#' @param amdis.report specify the identification report file
#' @param Ret.Time.Filter set the retetion time filter window around the expected retention time (in +/-min).
#' @param RT.shift.limt set a threshole in seconds to define a molecule have 
#' @param mz_L lower mz limit for quantification ion selection
#' @param mz_U upper mz limit for quantification ion selection
#' @param generate_rt_shift_graph Set TRUE to generate the retetion time shift plots for each molecules that have retention time discrepencies greater than the \code{"RT.shift.limt"}
#' @param RTcorrection Set TRUE to enable a retention time correction before multiple peaks resolving
#' @return None
#'
#' @examples
#' amdis_id_Summary()
#'
#' @export
amdis_id_Summary<-function(workdir= NULL,
                           MS.L= NULL,
                           amdis.report = NULL,
                           File.name = "Summary Report.csv",
                           MsLibrary=c("NIST", "InHouse"),
                           Ret.Time.Filter=2.5,
                           RT.shift.limt = 60,
                           mz_L=38,
                           mz_U=550,
                           generate_rt_shift_graph=F,
                           generate_rt_shift_graphs=F,
                           RTcorrection=F
){
  library("tcltk")
  library("tcltk2")
  # require("Metab")
  library("pander")
  library("parallel")
  #library("OneR")
  library("dplyr")
  library("ggplot2")
  library("ggpubr")
  #workdir <- workdir
  if (is.null(workdir)){workdir= tk_choose.dir(caption = "Select working directory")}

  if (is.null(amdis.report)){amdis.report = tk_choose.files(default = "Select the AMDIS report in .TXT",
                                                            multi = FALSE, caption = "Select the AMDIS report in .TXT",
                                                            filter=matrix(c("Text", ".txt", "All files", "*"),2, 2, byrow = TRUE))}
  
    if (is.null(MS.L)){MS.L= tk_choose.files(caption="Select MS library (e.g. SVB_total or NIST) in .msl",
                                           filter=matrix(c("MSL", ".msl", "All files", "*"),2, 2, byrow = TRUE))}
  setwd(workdir)
  amdis.report = data.table::fread(amdis.report,sep = "\t", stringsAsFactors=FALSE)
  message("amdis_id_Summary parameters")
  message(paste("RT filter: ", Ret.Time.Filter, " min", sep=""))
  message(paste("Detect mutiple peaks when RT range is greater than: ", RT.shift.limt, " s", sep=""))
  message(paste("Quant mass will be selected within: ", mz_L, "-", mz_U,sep=""))
  message(paste("Perform RT correction before multi-peak resolving: ",RTcorrection ,sep=""))
  # Extract Reference ion, CAS # from MS library
  lib.txt<-readLines(MS.L)
  lib.txt1<-lib.txt[-grep("^[ ]*$",lib.txt)]
  att.nms<-unique(sapply(strsplit(lib.txt1[grep(":",lib.txt1)],":"),function(x) x[1]))
  entry.nms<-sapply(strsplit(lib.txt1[grep("NAME:",lib.txt1)],"ME:"), function(x) x[2])
  rez<-matrix(NA, nrow=length(entry.nms), ncol=length(att.nms), dimnames=list(NULL,att.nms ))
  starts<-grep("NAME:", lib.txt)
  stops<-c(starts[1:(length(starts)-1)]+diff(starts)-1,length(lib.txt))
  for (i in 1:length(starts)){
    tmp<-lib.txt[starts[i]:stops[i]]
    sapply(strsplit(tmp[grep(":",tmp)],":"), function(x) rez[i,x[1]]<<-x[2])
  }
  
  # ExtractIon for NIST library
  if(MsLibrary=="NIST"){
    message("Running in NIST mode, will generate Ref ions for each metabolites within mz detection range")
    substrRight <- function(x, n){
      substr(x, nchar(x)-n+1, nchar(x))
    }
    reference_ion<-NULL
    
    #test<-buildLib(AmdisLib=MS.L, folder=workdir, save = F, output = "ion_lib.csv", verbose = F,mz_L = mz_L,mz_U = mz_U)
    
    
    #reference_ion=parLapply(cl=autoStopCluster(makeCluster(detectCores()-1)), 1:length(starts),parse_msl_par,lib.txt,starts,stops,mz_L,mz_U)
    
    #reference_ion=unlist(reference_ion)
    
    #sapply(strsplit(tmp[grep(":",tmp)],":"), function(x) rez[i,x[1]]<<-x[2])
    reference_ion=0
    
    rez<-data.frame(rez,stringsAsFactors=FALSE)
    rez$NAME<-entry.nms
    libr<-cbind(rez,reference_ion)## returns list of peaks
    libr$NAME <- gsub("?", "", libr$NAME, fixed = TRUE)
    libr$NAME <- gsub("^ ", "", libr$NAME, perl=T)
    libr$NAME <- gsub("^ ", "", libr$NAME, perl=T)
    libr$NAME <- gsub("[*:%^!;$]", "", libr$NAME, perl=T)
    
    # Generate Summary report
    AMDIS.report <-amdis.report
    AMDIS.report$Name <- gsub("?", "", AMDIS.report$Name, fixed = TRUE)
    AMDIS.report$Name <- gsub("^ ", "", AMDIS.report$Name, perl=T)
    AMDIS.report$Name <- gsub("[*:%^!*?&;$]", "", AMDIS.report$Name, perl=T)
    AMDIS.report$Width <- gsub(">", "",  AMDIS.report$Width, perl=T)
    AMDIS.report$Width <- as.numeric(gsub("scans", "",  AMDIS.report$Width, perl=T))
    #AMDIS.reportcas<-AMDIS.report[AMDIS.report$CAS %in% libr$CASNO,]
    AMDIS.report<-AMDIS.report[AMDIS.report$Name %in% gsub("^ ","",libr$NAME),]
    #AMDIS.report1<-AMDIS.report[AMDIS.report$Name %in% libr$NAME,]
    #
    
    if (RTcorrection){
      library(ggpubr)
      RT.correction.grid<-RT_correction_xcms()
      RT.correction.result<-RT_correction(df=AMDIS.report)
      AMDIS.report<-RT.correction.result[[1]]
      ggpubr::ggarrange(RT.correction.result[[2]],RT.correction.result[[3]],
                        labels = c("Before","After"),common.legend = T,
                        ncol = 2, nrow =1,vjust=1.5) %>%  ggexport(filename = paste(getwd(),"/retention time correction.png",sep=""),width = 1800, height = 900,verbose = NULL,res = 100)
      
    }
    Metabolite.list <-split(AMDIS.report$RT, AMDIS.report$Name)
    RT.stats<-t(sapply(Metabolite.list,function(x) c(RT.median=round(median(x,na.rm=TRUE),3),
                                                     RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*1.5))*60,2),
                                                     RT.shift=round(((quantile(x, prob = 0.75,na.rm=TRUE)+IQR(x,na.rm=TRUE)*1.5)-median(x,na.rm=TRUE))*60,2)
    )))
    colnames(RT.stats)<-c("RT.median","RT.shfit.Lower", "RT.shfit.upper")
    RT.stats=RT.stats[is.na(RT.stats[,"RT.median"])!=T,]
    Peak.width <- sapply(split(AMDIS.report$Width, AMDIS.report$Name), function(x) median(x))
    
    
    ID.stats<-t(sapply(split(AMDIS.report$Weighted, AMDIS.report$Name),function(x) c(Library.match=round(mean(x,na.rm=T)),
                                                                                     Total.ID=length(x[!is.na(x)]))))
    #message("Running in NIST mode, will generate Ref ions for each metabolites within mz detection range")
    AMDIS.report.RT.stats <- data.frame(cbind(Name=0,Ref.ion=0, ID.stats, RT.stats, Peak.width, CAS=0))
    #message(paste(AMDIS.report.RT.stats$Ref.ion[1:40],sep="\n"))
    #AMDIS.report.RT.stats$Ref.ion <-  libr$reference_ion[match(strtrim(rownames(RT.stats),70),strtrim(libr$NAME,70))]
    #matchID=match(rownames(RT.stats),libr$NAME)
    matchID=match(strtrim(rownames(RT.stats),170),strtrim(libr$NAME,170))
    #AMDIS.report.RT.stats=AMDIS.report.RT.stats[which(!is.na(matchID)),]
    #matchID70=match(strtrim(rownames(RT.stats),70),strtrim(libr$NAME,70))
    #intersect(matchID,matchID70)
    AMDIS.report.RT.stats$Ref.ion <-  unlist(parallel::parLapply(cl=autoStopCluster(makeCluster(detectCores()-1)), 
                                                                 matchID,
                                                                 parse_msl_par,lib.txt,starts,stops,mz_L,mz_U))
    
    #for (ids in matchID){
    # Ref.iontest= parse_msl_par(ids,lib.txt,starts,stops,mz_L,mz_U)
      
    #}
    #match("2-Cyclopropylethynylcyclopropane",libr$NAME)
    #message(AMDIS.report.RT.stats$Ref.ion)
    #which(match(rownames(RT.stats),libr$NAME)==NA)
    AMDIS.report.RT.stats$CAS <-  libr$CAS[match(strtrim(rownames(RT.stats),70),strtrim(libr$NAME,70))]
    AMDIS.report.RT.stats$Name <- rownames(RT.stats)
    
    #total.n<-length(unique(AMDIS.report$FileName))
    #ID_name<-paste("Total.ID(n=",total.n,")",sep="" )
    #names(AMDIS.report.RT.stats)[3]<-ID_name
    
    
    ## Remove point outside the range
    RT.list<-lapply(Metabolite.list,function(x) c(RT.shift=(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*1.5),
                                                  RT.shift=(quantile(x, prob = 0.75,na.rm=TRUE)+IQR(x,na.rm=TRUE)*1.5)))
    Remove.outlier<-function(x,y){
      x[x >= y[1]]
      x[x <= y[2]]
    }
    
    Remove.metabolite <-mapply(Remove.outlier, Metabolite.list, RT.list )
    
    # Plot all retention time of all metabolites
    
    n.total <- length(Metabolite.list)
    png(paste(getwd(),"/retention time of all metabolites.png",sep=""),width = 30,height = 30, bg = "white",units = "in",res = 150)
    par(mar=c(5,10,5,5))
    boxplot(Metabolite.list, xlab= "Retention time (min)", cex.axis=0.2,  horizontal = TRUE, las=2, xaxt="n", main = paste("Retention time distribution of all metabolites ","(n=",n.total,")",sep=""))
    axis(1,cex.axis=1,las=1)
    dev.off()
    
    
    ## solve multimodel issue
    
    
    
    AMDIS.report.split<-NULL
    deletenamelist<-NULL
    save.folder<-paste("RTshift_warnings_",RT.shift.limt,"s_","shift",sep="")
    if (dir.exists(save.folder)!=T) try(dir.create (save.folder))
    
    for (component in 1:length(Remove.metabolite)){
      if(`&`((RT.stats[component,2] + RT.stats[component,3]) > RT.shift.limt, T)){
        #names(Remove.metabolite)[component]
        RT_libsimilarity=NULL
        #plot figure
        names.var<- windows_filename(names(Remove.metabolite)[component])
        RT_libsimilarity<-AMDIS.report[AMDIS.report$Name==names(Remove.metabolite)[component],]
        #nbins=6
        #res <- try(RT_libsimilarity$peakgroup<-OneR::bin(RT_libsimilarity$RT,nbins = nbins,labels = c(paste("Peak",1:nbins)), method = "clusters",na.omit = T),silent = TRUE)
        #if (class(res) != "try-error"){
        #while(`&`(sum(grep("Peak",RT_libsimilarity$peakgroup))==0,nbins>=1)){
        #  nbins=nbins-1
        #  res <- try(RT_libsimilarity$peakgroup<-OneR::bin(RT_libsimilarity$RT,nbins = nbins,labels = c(paste("Peak",1:nbins)), method = "clusters",na.omit = T),silent = TRUE)
        #}
        #RT_libsimilarity$peakgroup_RT_median=RT_libsimilarity$RT
        #if (sum(grep("Peak",RT_libsimilarity$peakgroup))!=0){
        #  RT_libsimilarity$peakgroup_RT_median=NULL
        #  RT_libsimilarity.stats<-RT_libsimilarity %>% group_by(peakgroup) %>% summarise(peakgroup_RT_median = median(RT))
         # RT_libsimilarity=merge(RT_libsimilarity,RT_libsimilarity.stats)
          
         # }
        #RT_libsimilarity<-AMDIS.report[AMDIS.report$Name==names(Remove.metabolite)[component],]
        #RT_libsimilarity$peakgroupRT=median(RT_libsimilarity$peakgroup)
        #RT_libsimilarity$series=1:nrow(RT_libsimilarity)
        #try(RT_libsimilarity<-RT_libsimilarity[order(RT_libsimilarity$FileName),])

        #estimate number of model
        m.var <- data.frame(Remove.metabolite[component])[,1]
        metabolite.name<-names(Remove.metabolite[component])
        res=NULL
        res <- try(density.all <- density(m.var, bw = "SJ"))
        if(class(res)=="try-error"){density.all <- density.default(m.var)}
        density.var <- density.all$y
        density.x <-  density.all$x[density.var>0.01]
        density.var <-  density.var[density.var>0.01]
        
        modes <- NULL
        centre.p <- NULL
        for ( i in 2:(length(density.var)-1) ){
          if ( (density.var[i] > density.var[i-1]) & (density.var[i] > density.var[i+1]) ) {
            modes <- c(modes,i)
          }
          if ( (density.var[i] < density.var[i-1]) & (density.var[i] < density.var[i+1]) ) {
            centre.p <- c(centre.p,i)
          }
        }
        
        new.median<- density.x[modes]
        new.boundary<- density.x [centre.p]
        npeaks <-length(new.median)
        
        if(npeaks>1){
          
          vComposti <- vector("list", npeaks)
          m.var.sub <- m.var
          
          for (i in 1:(npeaks-1)){
            vComposti[i] <-list(m.var.sub [m.var.sub <new.boundary[i]])
            m.var.sub[m.var.sub<new.boundary[i]]<-NA
          }
          
          vComposti[npeaks] <- list(m.var.sub)
          
          RT.add.stats <- t(sapply(vComposti,function(x) c(Total.ID=length(x[!is.na(x)]),
                                                           RT.median=round(median(x,na.rm=TRUE),3),
                                                           RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*3))*60,2),
                                                           RT.shift=round(((quantile(x, prob = 0.75, na.rm=TRUE)+IQR(x,na.rm=TRUE)*3)-median(x,na.rm=TRUE))*60,2))))
          
          colnames(RT.add.stats)<-c("Total.ID","RT.median","RT.shfit.Lower", "RT.shfit.upper")
          AMDIS.report.RT.stats.add <- AMDIS.report.RT.stats[rownames(AMDIS.report.RT.stats)==metabolite.name,]
          AMDIS.report.RT.stats.add <- cbind(AMDIS.report.RT.stats.add[,1:3] ,RT.add.stats, AMDIS.report.RT.stats.add[,8:9], row.names = NULL)
          
          # re-name
          for (i in 1:npeaks){
            AMDIS.report.RT.stats.add$Name[i] <- paste(metabolite.name, " (split peak ", i, ")", sep="")
          }
          AMDIS.report.split <- rbind(AMDIS.report.split, AMDIS.report.RT.stats.add)
          deletenamelist<-c(deletenamelist,metabolite.name)

        }          
        if (generate_rt_shift_graph){          
          #png(filename=paste(save.folder,"/",names.var,".png",sep=""),width = 480,height = 720,)
          #par(mfrow = c(2, 1))
          #png(filename=paste(save.folder,"/",names.var,"lib_score.png",sep=""),width = 900,height = 1200)
          #par(mfrow = c(2, 3))
            #RT_libsimilarity<-AMDIS.report[AMDIS.report$Name==names(Remove.metabolite)[i],]
            #RT_libsimilarity$peakgroup="Outliner"
            #RT_libsimilarity$peakgroup_RT_median=RT_libsimilarity$RT

          names.var<- windows_filename(names(Remove.metabolite)[component])          

          
          #boxplotlibsim=as_ggplot(boxplot(Weighted~peakgroup,data=RT_libsimilarity, main=names(Remove.metabolite)[i], xlab="Retention time (min)", ylab="Library Score"))

          #rtdensityplot=plot(data.frame(Remove.metabolite[i])[,1],1:length(data.frame(Remove.metabolite[i])[,1]),  ylab = "Density",xlab= "Retention time (min)", main = names(Remove.metabolite)[i], lwd=1.5)
          #dev.off()
          #plot figure
          
            png(filename=paste(save.folder,"/",names.var,".png",sep=""))
            #png(filename=paste(save.folder,"/","aaa",".png",sep=""))
            
            plot(RT_libsimilarity$RT,RT_libsimilarity$Net,  ylab = "Net score",
                 xlab= "Retention time (min)", main = names(Remove.metabolite)[component], lwd=1.5)
            dev.off()
          
          
          #png(filename=paste(save.folder,"/",names.var,"_lib_score.png",sep=""),width = 400, height = 400)

          
          }
        if (generate_rt_shift_graphs){          
            #png(filename=paste(save.folder,"/",names.var,".png",sep=""),width = 480,height = 720,)
            #par(mfrow = c(2, 1))
            #png(filename=paste(save.folder,"/",names.var,"lib_score.png",sep=""),width = 900,height = 1200)
            #par(mfrow = c(2, 3))
            #RT_libsimilarity<-AMDIS.report[AMDIS.report$Name==names(Remove.metabolite)[i],]
            RT_libsimilarity$peakgroup="Outliner"
            RT_libsimilarity$peakgroup_RT_median=RT_libsimilarity$RT
            for (peak in 1: nrow(AMDIS.report.RT.stats.add)){
              RT_libsimilarity$peakgroup=ifelse(`&`(RT_libsimilarity$RT>=(AMDIS.report.RT.stats.add$RT.median[peak]-AMDIS.report.RT.stats.add$RT.shfit.Lower[peak]/60),
                                                    RT_libsimilarity$RT<=(AMDIS.report.RT.stats.add$RT.median[peak]+AMDIS.report.RT.stats.add$RT.shfit.upper[peak]/60)),paste("Peak",peak),RT_libsimilarity$peakgroup)
              RT_libsimilarity$peakgroup_RT_median=ifelse(`&`(RT_libsimilarity$RT>=(AMDIS.report.RT.stats.add$RT.median[peak]-AMDIS.report.RT.stats.add$RT.shfit.Lower[peak]/60),
                                                              RT_libsimilarity$RT<=(AMDIS.report.RT.stats.add$RT.median[peak]+AMDIS.report.RT.stats.add$RT.shfit.upper[peak]/60)),AMDIS.report.RT.stats.add$RT.median[peak],"")
            }
            names.var<- windows_filename(names(Remove.metabolite)[component])          
            boxplotlibsim <- ggplot(RT_libsimilarity, aes(x=peakgroup, y=Net, fill=peakgroup)) + 
              geom_boxplot() +  geom_jitter(shape=16, position=position_jitter(0.2)) + theme(
                panel.background = element_rect(fill = "white"),
                plot.margin = ggplot2::margin(1, 1, 1, 1, "cm"),
                plot.background = element_rect(
                  fill = "grey90",
                  colour = "black"
                ))
            
            #boxplotlibsim=as_ggplot(boxplot(Weighted~peakgroup,data=RT_libsimilarity, main=names(Remove.metabolite)[i], xlab="Retention time (min)", ylab="Library Score"))
            
            #rtdensityplot=plot(data.frame(Remove.metabolite[i])[,1],1:length(data.frame(Remove.metabolite[i])[,1]),  ylab = "Density",xlab= "Retention time (min)", main = names(Remove.metabolite)[i], lwd=1.5)
            #dev.off()
            
            
            #png(filename=paste(save.folder,"/","aaa",".png",sep=""))
            
            Net.plot=ggplot(RT_libsimilarity, aes(x=RT, y=Net, color=peakgroup,size=2)) + geom_point() + theme(
              panel.background = element_rect(fill = "white"),
              plot.margin = ggplot2::margin(1, 1, 1, 1, "cm"),
              plot.background = element_rect(
                fill = "grey90",
                colour = "black"
              ))
            
            ggpubr::ggarrange(boxplotlibsim,Net.plot,
                              labels = c("boxplot","Net score"),
                              ncol = 2, nrow =1,vjust=1.5) %>%  ggexport(filename = paste(save.folder,"/",names.var,"_lib_score.png",sep=""),width = 900, height = 400,verbose = NULL)
            #dev.off()
          }
      }
    }
    
    if(is.null(AMDIS.report.split)!=TRUE){
      
      deletenamelist=unique(deletenamelist)
      
      AMDIS.report.RT.statscombine<-AMDIS.report.RT.stats[!(AMDIS.report.RT.stats$Name %in% (deletenamelist)),]
      
      AMDIS.report.split=unique(AMDIS.report.split)
      
      AMDIS.report.RT.stats <- rbind(AMDIS.report.RT.statscombine, AMDIS.report.split)
    }
    
    AMDIS.report.RT.stats <- AMDIS.report.RT.stats[order(AMDIS.report.RT.stats$RT.median, decreasing = FALSE),]
    File.nameNIST <- "Summary report_NIST.csv"
    write.csv(AMDIS.report.RT.stats, file = File.nameNIST,row.names = FALSE)
    write.csv(libr, file = "Library summary.csv",row.names = FALSE)
    
    
    
    
    message(paste("Summary report_NIST.csv was created and save in",workdir, sep=" "))
  } else if (MsLibrary=="InHouse") {
    
    rez<-data.frame(rez,stringsAsFactors=FALSE)
    rez$NAME<-entry.nms
    libr<-rez## returns list of peaks
    
    # Generate summary report
    AMDIS.report<-amdis.report
    AMDIS.report<-AMDIS.report[AMDIS.report$RT-AMDIS.report$`Expec. RT` < Ret.Time.Filter,]
    AMDIS.report<-AMDIS.report[AMDIS.report$RT-AMDIS.report$`Expec. RT`> -Ret.Time.Filter,]
    AMDIS.report$Name <- gsub("?", "", AMDIS.report$Name, fixed = TRUE)
    AMDIS.report$Name <- gsub("^ ", "", AMDIS.report$Name, perl=T)
    AMDIS.report$Name <- gsub("[*:%^!*?&;$]", "", AMDIS.report$Name, perl=T)
    AMDIS.report$Width <- gsub(">", "",  AMDIS.report$Width, perl=T)
    AMDIS.report$Width <- as.numeric(gsub("scans", "",  AMDIS.report$Width, perl=T))
    
    
    
    Metabolite.list <-split(AMDIS.report$RT, AMDIS.report$Name)
    
    RT.stats<-t(sapply(Metabolite.list,function(x) c(RT.median=round(median(x,na.rm=TRUE),3),
                                                     RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25)-IQR(x,na.rm=TRUE)*1.5))*60,2),
                                                     RT.shift=round(((quantile(x, prob = 0.75)+IQR(x,na.rm=TRUE)*1.5)-median(x,na.rm=TRUE))*60,2))))
    colnames(RT.stats)<-c("RT.median","RT.shfit.Lower", "RT.shfit.upper")
    
    Peak.width <- sapply(split(AMDIS.report$Width, AMDIS.report$Name), function(x) median(x))
    
    
    RT.stats[is.na(RT.stats)]<-0
    ID.stats<-t(sapply(split(AMDIS.report$Weighted, AMDIS.report$Name),function(x) c(Library.match=round(mean(x,na.rm=T)),
                                                                                     Total.ID=length(x[!is.na(x)]))))
    AMDIS.report.RT.stats<-NULL
    AMDIS.report.RT.stats <- as.data.frame(cbind(Name=0,Ref.ion=0,ID.stats,RT.stats,Expec.RT=0,Diff.RT=0, Peak.width, CAS=0))
    AMDIS.report.RT.stats$Ref.ion <- libr$RSN[match(rownames(RT.stats),libr$NAME)]
    AMDIS.report.RT.stats$Expec.RT <- round(as.numeric(libr$RT[match(rownames(RT.stats),libr$NAME)]),2)
    AMDIS.report.RT.stats$CAS <- libr$CASNO[match(rownames(RT.stats),libr$NAME)]
    AMDIS.report.RT.stats$Diff.RT <- AMDIS.report.RT.stats$Expec.RT- as.numeric(AMDIS.report.RT.stats$RT.median)
    AMDIS.report.RT.stats$Name <- rownames(RT.stats)
    
    
    
    
    
    ## Remove point outside the range
    RT.list<-lapply(Metabolite.list,function(x) c(RT.shift=(quantile(x, prob = 0.25)-IQR(x,na.rm=TRUE)*1.5),
                                                  RT.shift=(quantile(x, prob = 0.75)+IQR(x,na.rm=TRUE)*1.5)))
    Remove.outlier<-function(x,y){
      x[x >= y[1]]
      x[x <= y[2]]
    }
    
    Remove.metabolite <-mapply(Remove.outlier, Metabolite.list, RT.list )
    
    # Plot all retention time of all metabolites
    n.total <- length(Metabolite.list)
    par(mar=c(5,10,5,5))
    boxplot(Metabolite.list, xlab= "Retention time (min)", cex.axis=0.5,  horizontal = TRUE, las=2, xaxt="n", main = paste("Retention time distribution of all metabolites ","(n=",n.total,")",sep=""))
    axis(1,cex.axis=1,las=1)
    
    
    ## solve multimodel issue
    AMDIS.report.split<-NULL
    
    save.folder<-paste("RTshift_warnings_",RT.shift.limt,"s_","shift",sep="")
    if (dir.exists(save.folder)!=T) try(dir.create (save.folder))
    for (i in 1:length(Remove.metabolite)){
      if(RT.stats[i,2] + RT.stats[i,3] > RT.shift.limt ){
        
        # plot figure
        names.var<- gsub("/", "", names(Remove.metabolite)[i], perl=T)
        png(filename=paste(save.folder,"/",names.var,".png",sep=""))
        plot(data.frame(Remove.metabolite[i])[,1],1:length(data.frame(Remove.metabolite[i])[,1]),  ylab = "Density",
             xlab= "Retention time (min)", main = names(Remove.metabolite)[i],lwd=1.5)
        dev.off()
        
        #estimate number of model
        m.var <- data.frame(Remove.metabolite[i])[,1]
        metabolite.name<-names(Remove.metabolite[i])
        
        density.all <- density(m.var, bw = "SJ")
        density.var <- density(m.var, bw = "SJ")$y
        density.x <-  density.all$x[density.var>0.01]
        density.var <-  density.var[density.var>0.01]
        
        modes <- NULL
        centre.p <- NULL
        for ( i in 2:(length(density.var)-1) ){
          if ( (density.var[i] > density.var[i-1]) & (density.var[i] > density.var[i+1]) ) {
            modes <- c(modes,i)
          }
          if ( (density.var[i] < density.var[i-1]) & (density.var[i] < density.var[i+1]) ) {
            centre.p <- c(centre.p,i)
          }
        }
        
        new.median<- density.x[modes]
        new.boundary<- density.x [centre.p]
        npeaks <-length(new.median)
        
        if(npeaks>1){
          
          vComposti <- vector("list", npeaks)
          m.var.sub <- m.var
          
          for (i in 1:(npeaks-1)){
            vComposti[i] <-list(m.var.sub [m.var.sub <new.boundary[i]])
            m.var.sub[m.var.sub<new.boundary[i]]<-NA
          }
          
          vComposti[npeaks] <- list(m.var.sub)
          
          RT.add.stats <- t(sapply(vComposti,function(x) c(Total.ID=length(x[!is.na(x)]),
                                                           RT.median=round(median(x,na.rm=TRUE),3),
                                                           RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*3))*60,2),
                                                           RT.shift=round(((quantile(x, prob = 0.75, na.rm=TRUE)+IQR(x,na.rm=TRUE)*3)-median(x,na.rm=TRUE))*60,2))))
          
          colnames(RT.add.stats)<-c("Total.ID","RT.median","RT.shfit.Lower", "RT.shfit.upper")
          AMDIS.report.RT.stats.add <- AMDIS.report.RT.stats[rownames(AMDIS.report.RT.stats)==metabolite.name,]
          AMDIS.report.RT.stats.add <- cbind(AMDIS.report.RT.stats.add[,1:3] ,RT.add.stats, AMDIS.report.RT.stats.add[,8:11], row.names = NULL)
          
          # re-name
          for (i in 1:npeaks){
            AMDIS.report.RT.stats.add$Name[i] <- paste(metabolite.name, " (split peak ", i, ")", sep="")
          }
          AMDIS.report.split <- rbind(AMDIS.report.split, AMDIS.report.RT.stats.add)
        }
      }
    }
    
    if(is.null(AMDIS.report.split)!=TRUE){
      AMDIS.report.RT.stats <- rbind(AMDIS.report.RT.stats, AMDIS.report.split)
    }
    if ("CAS" %in% names(AMDIS.report.RT.stats)) {
      library(stringr)
      AMDIS.report.RT.stats$CAS<-gsub(" ","",AMDIS.report.RT.stats$CAS)
      AMDIS.report.RT.stats$CAS<-gsub("-","",AMDIS.report.RT.stats$CAS)
      AMDIS.report.RT.stats_CAS_width<-str_length(AMDIS.report.RT.stats$CAS)
      cas_last<-str_sub(AMDIS.report.RT.stats$CAS,AMDIS.report.RT.stats_CAS_width,AMDIS.report.RT.stats_CAS_width)
      cas_last_23<-str_sub(AMDIS.report.RT.stats$CAS,AMDIS.report.RT.stats_CAS_width-2,AMDIS.report.RT.stats_CAS_width-1)
      cas_last_rest<-str_sub(AMDIS.report.RT.stats$CAS,1,AMDIS.report.RT.stats_CAS_width-3)
      casfinal<-data.frame(a=cas_last_rest,b=cas_last_23,c=cas_last,stringsAsFactors = F)
      cas_final<-unlist(lapply(1:nrow(casfinal),function(x,casfinal){
        str_glue(casfinal$a[x],"-",casfinal$b[x],"-",casfinal$c[x])
      },casfinal))
      AMDIS.report.RT.stats$CAS<-cas_final
    }
    AMDIS.report.RT.stats <- AMDIS.report.RT.stats[order(AMDIS.report.RT.stats$RT.median, decreasing = FALSE),]
    File.name <- tclvalue(tkgetSaveFile(initialfile=File.name))
    write.csv(AMDIS.report.RT.stats, file = File.name,row.names = FALSE)
    print(paste("AMDIS summary report.csv is generated and save in",workdir, sep=" "))
  }
}

#' mgf report file parsing and summary
#'
#' This is a function to process the AMDIS_report.txt file. specify a in-house library or a subset of NIST library to find the most intense quantification ion within the GC-MS experiment m/z range. The script will invoke selection windows to let the user specify the files location.  
#'
#' @param workdir locate the working Dir
#' @param MS.L specify the library file
#' @param MsLibrary specify the library file origin, could be "NIST" or "InHouse"
#' @param amdis.report specify the identification report file
#' @param Ret.Time.Filter set the retetion time filter window around the expected retention time (in +/-min).
#' @param RT.shift.limt set a threshole in seconds to define a molecule have 
#' @param mz_L lower mz limit for quantification ion selection
#' @param mz_U upper mz limit for quantification ion selection
#' @param generate_rt_shift_graph Set TRUE to generate the retetion time shift plots for each molecules that have retention time discrepencies greater than the \code{"RT.shift.limt"}
#' @param RTcorrection Set TRUE to enable a retention time correction before multiple peaks resolving
#' @return None
#'
#' @examples
#' mgf_id_Summary()
#'
#' @export
mgf_id_Summary<-function(workdir= getwd(),
                         mgf.report = NULL,mgf.peak.rds = NULL,
                           Ret.Time.Filter=2.5,
                           RT.shift.limt = 30,
                           generate_rt_shift_graph=T,
                           generate_rt_shift_graphs=F
){
  library("tcltk")
  library("tcltk2")
  # require("Metab")
  library("pander")
  library("parallel")
  #library("OneR")
  library("dplyr")
  library("ggplot2")
  library("ggpubr")
  library(stringr)
  #workdir <- workdir
  if (is.null(workdir)){workdir= tk_choose.dir(caption = "Select working directory")}
  setwd(workdir) 
  if (is.null(mgf.report) ){
    mgf.report = tk_choose.files(default = "Select the mgf report",
                                                            multi = T, caption = "Select the mgf report in .mgf",
                                                            filter=matrix(c("MGF", ".mgf", "All files", "*"),2, 2, byrow = TRUE))
    peaks_list<-NULL
    for (mgf_file in mgf.report){
      lib.txt<-readLines(mgf_file)
      lib.txt1<-lib.txt[-grep("\\t",lib.txt)]
      att.nms<-unique(sapply(strsplit(lib.txt1[grep("=",lib.txt1)],"="),function(x) x[1]))
      entry.nms<-sapply(strsplit(lib.txt1[grep("TITLE=",lib.txt1)],"TITLE="), function(x) x[2])
      rez<-matrix(NA, nrow=length(entry.nms), ncol=length(att.nms), dimnames=list(NULL,att.nms ))
      
      starts<-grep("TITLE=", lib.txt)
      stops<-c(starts[1:(length(starts)-1)]+diff(starts)-1,length(lib.txt))
      for (i in 1:length(starts)){
        tmp<-lib.txt[starts[i]:stops[i]]
        sapply(strsplit(tmp[grep("=",tmp)],"="), function(x) rez[i,x[1]]<<-x[2])
      }
      rez<-as.data.frame(rez)
      rez$file<-stringr::str_remove(basename(mgf_file),".mgf")
      rez->peaks_list[[stringr::str_remove(basename(mgf_file),".mgf")]]
    }
    peaks_list_bind<-as.data.frame(do.call(rbind,peaks_list))
    peaks_list_bind$RT<-as.numeric(peaks_list_bind$RTINMINUTES)
    peaks_list_bind$Name<-peaks_list_bind$TITLE<-str_to_title(peaks_list_bind$TITLE)
  } else if(!is.null(mgf.peak.rds)) {
    peaks_list_raw<-readRDS(mgf.peak.rds)
    lapply(names(peaks_list_raw),function(x,peaks_list_raw){
      peaks_list_raw[[x]]->rez
      rez<-as.data.frame(rez)
      rez$file<-stringr::str_remove(basename(mgf_file),".mgf")
    })->peaks_list
      
    peaks_list_bind<-as.data.frame(do.call(rbind,peaks_list))
    peaks_list_bind$RT<-as.numeric(peaks_list_bind$RTINMINUTES)
    peaks_list_bind$Name<-peaks_list_bind$TITLE<-str_to_title(peaks_list_bind$TITLE)
    }else{
    stop("no mgf input.")
      }
  
  message("mgf_id_Summary parameters")
  
    Metabolite.list <-split(peaks_list_bind$RT, peaks_list_bind$Name)
    RT.stats<-t(sapply(Metabolite.list,function(x) {
      x<-as.numeric(x)
      c(RT.median=round(median(x,na.rm=TRUE),3),
        RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*1.5)),2),
        RT.shift=round(((quantile(x, prob = 0.75,na.rm=TRUE)+IQR(x,na.rm=TRUE)*1.5)-median(x,na.rm=TRUE)),2)
    )}
    ))
    colnames(RT.stats)<-c("RT.median","RT.shfit.Lower", "RT.shfit.upper")
    RT.stats=RT.stats[is.na(RT.stats[,"RT.median"])!=T,]
    
    
    ID.stats<-t(sapply(split(peaks_list_bind$RT, peaks_list_bind$Name),function(x) c(RT.first=x[1],
                                                                                     Total.ID=length(x[!is.na(x)]))))
    Ref.ion.stats<-t(sapply(split(peaks_list_bind$MODELION, peaks_list_bind$Name),function(x) c(Ref.ion=paste(unique(x),collapse = ";"),
                                                                                                Ref.ion.unique=length(unique(x)))))
    report.RT.stats <- data.frame(cbind(Name=0,Ref.ion.stats, ID.stats, RT.stats, CAS=0))
    
    report.RT.stats$Name <- rownames(RT.stats)
    
    RT.list<-lapply(Metabolite.list,function(x) c(RT.shift=(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*1.5),
                                                  RT.shift=(quantile(x, prob = 0.75,na.rm=TRUE)+IQR(x,na.rm=TRUE)*1.5)))
    Remove.outlier<-function(x,y){
      x[x >= y[1]]
      x[x <= y[2]]
    }
    
    Remove.metabolite <-mapply(Remove.outlier, Metabolite.list, RT.list )
    
    # Plot all retention time of all metabolites
    
    n.total <- length(Metabolite.list)
    png(paste(getwd(),"/retention time of all metabolites.png",sep=""),width = 30,height = 30, bg = "white",units = "in",res = 600)
    par(mar=c(5,10,5,5))
    boxplot(Metabolite.list, xlab= "Retention time (min)", cex.axis=0.2,  horizontal = TRUE, las=2, xaxt="n", main = paste("Retention time distribution of all metabolites ","(n=",n.total,")",sep=""))
    axis(1,cex.axis=1,las=1)
    dev.off()
    
    
    ## solve multimodel issue
    
    
    
    report.split<-NULL
    deletenamelist<-NULL
    save.folder<-paste("RTshift_warnings_",RT.shift.limt,"s_","shift",sep="")
    if (dir.exists(save.folder)!=T) try(dir.create (save.folder))
    
    for (component in 1:length(Remove.metabolite)){
      windows_filename<- function(stringX){
        stringX<-stringr::str_remove_all(stringX,"[><*?:\\/\\\\|]")
        stringX<-gsub("\"", "", stringX)
        if (nchar(stringX)>=100){stringX=strtrim(stringX,100)}
        return(stringX)
        
      }
      
      if(`&`((RT.stats[component,2] + RT.stats[component,3]) > RT.shift.limt, T)){
        RT_libsimilarity=NULL
        
        names.var<- windows_filename(names(Remove.metabolite)[component])
        RT_libsimilarity<-peaks_list_bind[peaks_list_bind$Name==names(Remove.metabolite)[component],]
        m.var <- data.frame(Remove.metabolite[component])[,1]
        metabolite.name<-names(Remove.metabolite[component])
        res=NULL
        res <- try(density.all <- density(m.var, bw = "SJ"))
        if(class(res)=="try-error"){density.all <- density.default(m.var)}
        density.var <- density.all$y
        density.x <-  density.all$x[density.var>0.01]
        density.var <-  density.var[density.var>0.01]
        
        modes <- NULL
        centre.p <- NULL
        for ( i in 2:(length(density.var)-1) ){
          if ( (density.var[i] > density.var[i-1]) & (density.var[i] > density.var[i+1]) ) {
            modes <- c(modes,i)
          }
          if ( (density.var[i] < density.var[i-1]) & (density.var[i] < density.var[i+1]) ) {
            centre.p <- c(centre.p,i)
          }
        }
        
        new.median<- density.x[modes]
        new.boundary<- density.x [centre.p]
        npeaks <-length(new.median)
        
        if(npeaks>1){
          
          vComposti <- vector("list", npeaks)
          m.var.sub <- m.var
          
          for (i in 1:(npeaks-1)){
            vComposti[i] <-list(m.var.sub [m.var.sub <new.boundary[i]])
            m.var.sub[m.var.sub<new.boundary[i]]<-NA
          }
          
          vComposti[npeaks] <- list(m.var.sub)
          
          RT.add.stats <- t(sapply(vComposti,function(x) c(Total.ID=length(x[!is.na(x)]),
                                                           RT.median=round(median(x,na.rm=TRUE),3),
                                                           RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*3)),2),
                                                           RT.shift=round(((quantile(x, prob = 0.75, na.rm=TRUE)+IQR(x,na.rm=TRUE)*3)-median(x,na.rm=TRUE)),2))))
          
          colnames(RT.add.stats)<-c("Total.ID","RT.median","RT.shfit.Lower", "RT.shfit.upper")
          report.RT.stats.add <- report.RT.stats[rownames(report.RT.stats)==metabolite.name,]
          
          report.RT.stats.add <- cbind(report.RT.stats.add[,1:4] ,RT.add.stats, CAS=report.RT.stats.add[,"CAS"], row.names = NULL)
          report.RT.stats.add$Name->report.RT.stats.add$Name_org
          # re-name
          for (i in 1:npeaks){
            report.RT.stats.add$Name[i] <- paste(metabolite.name, " (split peak ", i, ")", sep="")
          }
          report.split <- rbind(report.split, report.RT.stats.add)
          deletenamelist<-c(deletenamelist,metabolite.name)
          
        }          
        if (generate_rt_shift_graph){          
          names.var<- windows_filename(names(Remove.metabolite)[component])     
          
          png(filename=paste(save.folder,"/",names.var,".png",sep=""))
          plot(RT_libsimilarity$RT,log(as.numeric(RT_libsimilarity$INTEGRATEDAREA)),  ylab = "AREA (log)",
               xlab= "Retention time (min)", main = names(Remove.metabolite)[component], lwd=1.5)
          dev.off()
        }
        
      }
    }
    
    if(is.null(report.split)!=TRUE){
      
      deletenamelist=unique(deletenamelist)
      
      report.RT.statscombine<-report.RT.stats[!(report.RT.stats$Name %in% (deletenamelist)),]
      
      report.RT.resolved<-report.RT.stats[(report.RT.stats$Name %in% (deletenamelist)),]
      
      
      report.split=unique(report.split)
      report.split<-report.split[report.split$Total.ID>1,]
      report.split<-report.split[report.split$Name_org!="Unknown",]
      for ( i in 1:nrow(report.split)){
        peaks_list_bind_cpd<-peaks_list_bind[peaks_list_bind$Name==report.split$Name_org[i],]
        rtrow<-between(peaks_list_bind_cpd$RT,report.split$RT.median[i]-report.split$RT.shfit.Lower[i],report.split$RT.shfit.upper[i]+report.split$RT.median[i])
        report.split$Ref.ion[i]<-paste0(unique(peaks_list_bind_cpd$MODELION[rtrow]),collapse=";")
      }
      #report.split$Name_org<-NULL
      report.RT.statscombine$Name->report.RT.statscombine$Name_org
      report.RT.stats <- rbind(report.RT.statscombine, report.split)
    }
    
    report.RT.stats <- report.RT.stats[order(report.RT.stats$RT.median, decreasing = FALSE),]
    File.nameNIST <- "Summary_ID_report.csv"
    write.csv(report.RT.stats, file = File.nameNIST,row.names = FALSE)
    
    message(paste("Summary report was created and save in",workdir, sep=" "))
    message(paste("Summarizing area report from mgf report",workdir, sep=" "))
    
    summary_temp<-NULL
      report.RT.stats$RT.median<-as.numeric(report.RT.stats$RT.median)
      report.RT.stats$RT.shfit.upper<-as.numeric(report.RT.stats$RT.shfit.upper)
      report.RT.stats$RT.shfit.Lower<-as.numeric(report.RT.stats$RT.shfit.Lower)
    for (i in 1:nrow(report.RT.stats)){
      
      df<-NULL
      if(report.RT.stats$Total.ID[i]==1){
        peaks_list_bind[peaks_list_bind$RT == report.RT.stats$RT.first[i],]->df
        df[ df$MODELION==report.RT.stats$Ref.ion[i],]->df
        df$Name<-report.RT.stats$Name[i]
        summary_temp[[report.RT.stats$Name[i]]]<-df
      }else{
        peaks_list_bind[between(peaks_list_bind$RT ,
                        report.RT.stats$RT.median[i]-report.RT.stats$RT.shfit.Lower[i]-0.001,
                        report.RT.stats$RT.shfit.upper[i]+report.RT.stats$RT.median[i]+0.001),]->df
        df[ df$MODELION %in% str_split(report.RT.stats$Ref.ion[i],";")[[1]],]->df
        df$Name<-report.RT.stats$Name[i] 
        summary_temp[[report.RT.stats$Name[i]]]<-df
       }
        
      
        
    }
      summary_temp_bind<-do.call(rbind,summary_temp)
      summary_temp_bind$INTEGRATEDAREA<-as.numeric(summary_temp_bind$INTEGRATEDAREA)
      library(reshape2)
      summary_temp_bind_cast_ID_count<-dcast(summary_temp_bind,Name ~ file,value.var = "INTEGRATEDAREA",fun.aggregate = length)
      summary_temp_bind_cast_ID_sum<-dcast(summary_temp_bind,Name ~ file,value.var = "INTEGRATEDAREA",fun.aggregate = sum)
      
      File.nameNIST="Summary_ID_count_report.csv"
      write.csv(summary_temp_bind_cast_ID_count, file = File.nameNIST,row.names = FALSE)
      File.nameNIST="Summary_ID_area_report.csv"
      write.csv(summary_temp_bind_cast_ID_sum, file = File.nameNIST,row.names = FALSE)
      message(paste("Summarizing area report saved in",workdir, sep=" "))
      
}
#' MassHunter report file (.cefs) parsing and summary
#'
#' This is a function to process the AMDIS_report.txt file. specify a in-house library or a subset of NIST library to find the most intense quantification ion within the GC-MS experiment m/z range. The script will invoke selection windows to let the user specify the files location.  
#'
#' @param workdir locate the working Dir
#' @param MS.L specify the library file
#' @param MsLibrary specify the library file origin, could be "NIST" or "InHouse"
#' @param amdis.report specify the identification report file
#' @param Ret.Time.Filter set the retetion time filter window around the expected retention time (in +/-min).
#' @param RT.shift.limt set a threshole in seconds to define a molecule have 
#' @param mz_L lower mz limit for quantification ion selection
#' @param mz_U upper mz limit for quantification ion selection
#' @param generate_rt_shift_graph Set TRUE to generate the retetion time shift plots for each molecules that have retention time discrepencies greater than the \code{"RT.shift.limt"}
#' @param RTcorrection Set TRUE to enable a retention time correction before multiple peaks resolving
#' @return None
#'
#' @examples
#' MassHunter_id_Summary()
#'
#' @export
MassHunter_id_Summary<-function(workdir= NULL,
                           MS.L= NULL,
                           MassHunter.report = NULL,
                           File.name = "Summary Report.csv",
                           MsLibrary=c("NIST", "InHouse"),
                           Ret.Time.Filter=2.5,
                           RT.shift.limt = 60,
                           mz_L=38,
                           mz_U=550,
                           generate_rt_shift_graph=F,
                           generate_rt_shift_graphs=F,
                           RTcorrection=F
){
  library("tcltk")
  library("tcltk2")
  # require("Metab")
  library("pander")
  library("parallel")
  #library("OneR")
  library("dplyr")
  library("ggplot2")
  library("ggpubr")
  library(reticulate)
  #os <- import("os")
  #os$listdir(".")
  #py_install("fuzzywuzzy")
  #virtualenv_create("r-reticulate")
  #virtualenv_install("r-reticulate", "fuzzywuzzy")
  #system("pip install tk")
  if (is.null(workdir)){workdir= tk_choose.dir(caption = "Select working directory")}
  if (is.null(MassHunter.report)){MassHunter.report = tk_choose.dir(default = "",caption = "Select the MassHunter report folder contains .cef files")}
  if (is.null(MS.L)){MS.L= tk_choose.files(caption="Select MS library (e.g. SVB_total or NIST) in .msl",
                                           filter=matrix(c("MSL", ".msl", "All files", "*"),2, 2, byrow = TRUE))}
  
 #source_python(paste0(file.path(path.package(package="MassOmics")),"/src/masshunter-massOmics-summary-report.py"))
  #rm(parse_cef)
  #rm(return_df)
  setwd(workdir)
  source_python(paste0(file.path(path.package(package="MassOmics")),"/src/parse_Agilent_CEF.py"))
  return_df<-parse_cef(Folder = MassHunter.report)
  #workdir <- workdir
  
  
  

  
  
  setwd(workdir)
  MassHunter.report =as.data.frame (return_df)
  colnames(MassHunter.report)<-c("Name","n" ,"RT" , "ref","area","height","CAS" )
  message("MassHunter ID Summary parameters")
  message(paste("RT filter: ", Ret.Time.Filter, " min", sep=""))
  message(paste("Detect mutiple peaks when RT range is greater than: ", RT.shift.limt, " s", sep=""))
  message(paste("Quant mass will be selected within: ", mz_L, "-", mz_U,sep=""))
  message(paste("Perform RT correction before multi-peak resolving: ",RTcorrection ,sep=""))
  # Extract Reference ion, CAS # from MS library
  lib.txt<-readLines(MS.L)
  lib.txt1<-lib.txt[-grep("^[ ]*$",lib.txt)]
  att.nms<-unique(sapply(strsplit(lib.txt1[grep(":",lib.txt1)],":"),function(x) x[1]))
  entry.nms<-sapply(strsplit(lib.txt1[grep("NAME:",lib.txt1)],"ME:"), function(x) x[2])
  rez<-matrix(NA, nrow=length(entry.nms), ncol=length(att.nms), dimnames=list(NULL,att.nms ))
  starts<-grep("NAME:", lib.txt)
  stops<-c(starts[1:(length(starts)-1)]+diff(starts)-1,length(lib.txt))
  for (i in 1:length(starts)){
    tmp<-lib.txt[starts[i]:stops[i]]
    sapply(strsplit(tmp[grep(":",tmp)],":"), function(x) rez[i,x[1]]<<-x[2])
  }
  
  # ExtractIon for NIST library
  if(MsLibrary=="NIST"){
    message("Running in NIST mode, will generate Ref ions for each metabolites within mz detection range")
    substrRight <- function(x, n){
      substr(x, nchar(x)-n+1, nchar(x))
    }
    reference_ion<-NULL
    
    #test<-buildLib(AmdisLib=MS.L, folder=workdir, save = F, output = "ion_lib.csv", verbose = F,mz_L = mz_L,mz_U = mz_U)
    
    
    #reference_ion=parLapply(cl=autoStopCluster(makeCluster(detectCores()-1)), 1:length(starts),parse_msl_par,lib.txt,starts,stops,mz_L,mz_U)
    
    #reference_ion=unlist(reference_ion)
    
    #sapply(strsplit(tmp[grep(":",tmp)],":"), function(x) rez[i,x[1]]<<-x[2])
    reference_ion=0
    
    rez<-data.frame(rez,stringsAsFactors=FALSE)
    rez$NAME<-entry.nms
    libr<-cbind(rez,reference_ion)## returns list of peaks
    libr$NAME <- gsub("?", "", libr$NAME, fixed = TRUE)
    libr$NAME <- gsub("^ ", "", libr$NAME, perl=T)
    libr$NAME <- gsub("^ ", "", libr$NAME, perl=T)
    libr$NAME <- gsub("[*:%^!;$]", "", libr$NAME, perl=T)
    
    # Generate Summary report
    MassHunter.report <-MassHunter.report
    MassHunter.report$Name <- gsub("?", "", MassHunter.report$Name, fixed = TRUE)
    MassHunter.report$Name <- gsub("^ ", "", MassHunter.report$Name, perl=T)
    MassHunter.report$Name <- gsub("[*:%^!*?&;$]", "", MassHunter.report$Name, perl=T)
    MassHunter.report$Width <- 10
    #MassHunter.report$Width <- as.numeric(gsub("scans", "",  MassHunter.report$Width, perl=T))
    #MassHunter.reportcas<-MassHunter.report[MassHunter.report$CAS %in% libr$CASNO,]
    MassHunter.report<-MassHunter.report[MassHunter.report$Name %in% gsub("^ ","",libr$NAME),]
    #MassHunter.report1<-MassHunter.report[MassHunter.report$Name %in% libr$NAME,]
    #
    
    if (RTcorrection){
      library(ggpubr)
      RT.correction.grid<-RT_correction_xcms()
      RT.correction.result<-RT_correction(df=MassHunter.report)
      MassHunter.report<-RT.correction.result[[1]]
      ggpubr::ggarrange(RT.correction.result[[2]],RT.correction.result[[3]],
                        labels = c("Before","After"),common.legend = T,
                        ncol = 2, nrow =1,vjust=1.5) %>%  ggexport(filename = paste(getwd(),"/retention time correction.png",sep=""),width = 1800, height = 900,verbose = NULL,res = 100)
      
    }
    Metabolite.list <-split(MassHunter.report$RT, MassHunter.report$Name)
    RT.stats<-t(sapply(Metabolite.list,function(x) c(RT.median=round(median(x,na.rm=TRUE),3),
                                                     RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*1.5))*60,2),
                                                     RT.shift=round(((quantile(x, prob = 0.75,na.rm=TRUE)+IQR(x,na.rm=TRUE)*1.5)-median(x,na.rm=TRUE))*60,2)
    )))
    colnames(RT.stats)<-c("RT.median","RT.shfit.Lower", "RT.shfit.upper")
    RT.stats=RT.stats[is.na(RT.stats[,"RT.median"])!=T,]
    Peak.width <- sapply(split(MassHunter.report$Width, MassHunter.report$Name), function(x) median(x))
    
    
    ID.stats<-t(sapply(split(MassHunter.report$height, MassHunter.report$Name),function(x) c(Library.match=round(mean(x,na.rm=T)),
                                                                                     Total.ID=length(x[!is.na(x)]))))
    #message("Running in NIST mode, will generate Ref ions for each metabolites within mz detection range")
    MassHunter.report.RT.stats <- data.frame(cbind(Name=0,Ref.ion=0, ID.stats, RT.stats, Peak.width, CAS=0))
    #message(paste(MassHunter.report.RT.stats$Ref.ion[1:40],sep="\n"))
    #MassHunter.report.RT.stats$Ref.ion <-  libr$reference_ion[match(strtrim(rownames(RT.stats),70),strtrim(libr$NAME,70))]
    #matchID=match(rownames(RT.stats),libr$NAME)
    matchID=match(strtrim(rownames(RT.stats),170),strtrim(libr$NAME,170))
    #MassHunter.report.RT.stats=MassHunter.report.RT.stats[which(!is.na(matchID)),]
    #matchID70=match(strtrim(rownames(RT.stats),70),strtrim(libr$NAME,70))
    #intersect(matchID,matchID70)
    MassHunter.report.RT.stats$Ref.ion <-  unlist(parallel::parLapply(cl=autoStopCluster(makeCluster(detectCores()-1)), 
                                                                 matchID,
                                                                 parse_msl_par,lib.txt,starts,stops,mz_L,mz_U))
    
    #for (ids in matchID){
    # Ref.iontest= parse_msl_par(ids,lib.txt,starts,stops,mz_L,mz_U)
    
    #}
    #match("2-Cyclopropylethynylcyclopropane",libr$NAME)
    #message(MassHunter.report.RT.stats$Ref.ion)
    #which(match(rownames(RT.stats),libr$NAME)==NA)
    MassHunter.report.RT.stats$CAS <-  libr$CAS[match(strtrim(rownames(RT.stats),70),strtrim(libr$NAME,70))]
    MassHunter.report.RT.stats$Name <- rownames(RT.stats)
    
    #total.n<-length(unique(MassHunter.report$FileName))
    #ID_name<-paste("Total.ID(n=",total.n,")",sep="" )
    #names(MassHunter.report.RT.stats)[3]<-ID_name
    
    
    ## Remove point outside the range
    RT.list<-lapply(Metabolite.list,function(x) c(RT.shift=(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*1.5),
                                                  RT.shift=(quantile(x, prob = 0.75,na.rm=TRUE)+IQR(x,na.rm=TRUE)*1.5)))
    Remove.outlier<-function(x,y){
      x[x >= y[1]]
      x[x <= y[2]]
    }
    
    Remove.metabolite <-mapply(Remove.outlier, Metabolite.list, RT.list )
    
    # Plot all retention time of all metabolites
    
    n.total <- length(Metabolite.list)
    png(paste(getwd(),"/retention time of all metabolites.png",sep=""),width = 30,height = 30, bg = "white",units = "in",res = 150)
    par(mar=c(5,10,5,5))
    boxplot(Metabolite.list, xlab= "Retention time (min)", cex.axis=0.2,  horizontal = TRUE, las=2, xaxt="n", main = paste("Retention time distribution of all metabolites ","(n=",n.total,")",sep=""))
    axis(1,cex.axis=1,las=1)
    dev.off()
    
    
    ## solve multimodel issue
    
    
    
    MassHunter.report.split<-NULL
    deletenamelist<-NULL
    save.folder<-paste("RTshift_warnings_",RT.shift.limt,"s_","shift",sep="")
    if (dir.exists(save.folder)!=T) try(dir.create (save.folder))
    
    for (component in 1:length(Remove.metabolite)){
      if(`&`((RT.stats[component,2] + RT.stats[component,3]) > RT.shift.limt, T)){
        #names(Remove.metabolite)[component]
        RT_libsimilarity=NULL
        #plot figure
        names.var<- windows_filename(names(Remove.metabolite)[component])
        RT_libsimilarity<-MassHunter.report[MassHunter.report$Name==names(Remove.metabolite)[component],]
        #nbins=6
        #res <- try(RT_libsimilarity$peakgroup<-OneR::bin(RT_libsimilarity$RT,nbins = nbins,labels = c(paste("Peak",1:nbins)), method = "clusters",na.omit = T),silent = TRUE)
        #if (class(res) != "try-error"){
        #while(`&`(sum(grep("Peak",RT_libsimilarity$peakgroup))==0,nbins>=1)){
        #  nbins=nbins-1
        #  res <- try(RT_libsimilarity$peakgroup<-OneR::bin(RT_libsimilarity$RT,nbins = nbins,labels = c(paste("Peak",1:nbins)), method = "clusters",na.omit = T),silent = TRUE)
        #}
        #RT_libsimilarity$peakgroup_RT_median=RT_libsimilarity$RT
        #if (sum(grep("Peak",RT_libsimilarity$peakgroup))!=0){
        #  RT_libsimilarity$peakgroup_RT_median=NULL
        #  RT_libsimilarity.stats<-RT_libsimilarity %>% group_by(peakgroup) %>% summarise(peakgroup_RT_median = median(RT))
        # RT_libsimilarity=merge(RT_libsimilarity,RT_libsimilarity.stats)
        
        # }
        #RT_libsimilarity<-MassHunter.report[MassHunter.report$Name==names(Remove.metabolite)[component],]
        #RT_libsimilarity$peakgroupRT=median(RT_libsimilarity$peakgroup)
        #RT_libsimilarity$series=1:nrow(RT_libsimilarity)
        #try(RT_libsimilarity<-RT_libsimilarity[order(RT_libsimilarity$FileName),])
        
        #estimate number of model
        m.var <- data.frame(Remove.metabolite[component])[,1]
        metabolite.name<-names(Remove.metabolite[component])
        res=NULL
        res <- try(density.all <- density(m.var, bw = "SJ"))
        if(class(res)=="try-error"){density.all <- density.default(m.var)}
        density.var <- density.all$y
        density.x <-  density.all$x[density.var>0.01]
        density.var <-  density.var[density.var>0.01]
        
        modes <- NULL
        centre.p <- NULL
        for ( i in 2:(length(density.var)-1) ){
          if ( (density.var[i] > density.var[i-1]) & (density.var[i] > density.var[i+1]) ) {
            modes <- c(modes,i)
          }
          if ( (density.var[i] < density.var[i-1]) & (density.var[i] < density.var[i+1]) ) {
            centre.p <- c(centre.p,i)
          }
        }
        
        new.median<- density.x[modes]
        new.boundary<- density.x [centre.p]
        npeaks <-length(new.median)
        
        if(npeaks>1){
          
          vComposti <- vector("list", npeaks)
          m.var.sub <- m.var
          
          for (i in 1:(npeaks-1)){
            vComposti[i] <-list(m.var.sub [m.var.sub <new.boundary[i]])
            m.var.sub[m.var.sub<new.boundary[i]]<-NA
          }
          
          vComposti[npeaks] <- list(m.var.sub)
          
          RT.add.stats <- t(sapply(vComposti,function(x) c(Total.ID=length(x[!is.na(x)]),
                                                           RT.median=round(median(x,na.rm=TRUE),3),
                                                           RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*3))*60,2),
                                                           RT.shift=round(((quantile(x, prob = 0.75, na.rm=TRUE)+IQR(x,na.rm=TRUE)*3)-median(x,na.rm=TRUE))*60,2))))
          
          colnames(RT.add.stats)<-c("Total.ID","RT.median","RT.shfit.Lower", "RT.shfit.upper")
          MassHunter.report.RT.stats.add <- MassHunter.report.RT.stats[rownames(MassHunter.report.RT.stats)==metabolite.name,]
          MassHunter.report.RT.stats.add <- cbind(MassHunter.report.RT.stats.add[,1:3] ,RT.add.stats, MassHunter.report.RT.stats.add[,8:9], row.names = NULL)
          
          # re-name
          for (i in 1:npeaks){
            MassHunter.report.RT.stats.add$Name[i] <- paste(metabolite.name, " (split peak ", i, ")", sep="")
          }
          MassHunter.report.split <- rbind(MassHunter.report.split, MassHunter.report.RT.stats.add)
          deletenamelist<-c(deletenamelist,metabolite.name)
          
        }          
        if (generate_rt_shift_graph){          
          #png(filename=paste(save.folder,"/",names.var,".png",sep=""),width = 480,height = 720,)
          #par(mfrow = c(2, 1))
          #png(filename=paste(save.folder,"/",names.var,"lib_score.png",sep=""),width = 900,height = 1200)
          #par(mfrow = c(2, 3))
          #RT_libsimilarity<-MassHunter.report[MassHunter.report$Name==names(Remove.metabolite)[i],]
          #RT_libsimilarity$peakgroup="Outliner"
          #RT_libsimilarity$peakgroup_RT_median=RT_libsimilarity$RT
          
          names.var<- windows_filename(names(Remove.metabolite)[component])          
          
          
          #boxplotlibsim=as_ggplot(boxplot(Weighted~peakgroup,data=RT_libsimilarity, main=names(Remove.metabolite)[i], xlab="Retention time (min)", ylab="Library Score"))
          
          #rtdensityplot=plot(data.frame(Remove.metabolite[i])[,1],1:length(data.frame(Remove.metabolite[i])[,1]),  ylab = "Density",xlab= "Retention time (min)", main = names(Remove.metabolite)[i], lwd=1.5)
          #dev.off()
          #plot figure
          
          png(filename=paste(save.folder,"/",names.var,".png",sep=""))
          #png(filename=paste(save.folder,"/","aaa",".png",sep=""))
          
          plot(RT_libsimilarity$RT,RT_libsimilarity$Net,  ylab = "Net score",
               xlab= "Retention time (min)", main = names(Remove.metabolite)[component], lwd=1.5)
          dev.off()
          
          
          #png(filename=paste(save.folder,"/",names.var,"_lib_score.png",sep=""),width = 400, height = 400)
          
          
        }
        if (generate_rt_shift_graphs){          
          #png(filename=paste(save.folder,"/",names.var,".png",sep=""),width = 480,height = 720,)
          #par(mfrow = c(2, 1))
          #png(filename=paste(save.folder,"/",names.var,"lib_score.png",sep=""),width = 900,height = 1200)
          #par(mfrow = c(2, 3))
          #RT_libsimilarity<-MassHunter.report[MassHunter.report$Name==names(Remove.metabolite)[i],]
          RT_libsimilarity$peakgroup="Outliner"
          RT_libsimilarity$peakgroup_RT_median=RT_libsimilarity$RT
          for (peak in 1: nrow(MassHunter.report.RT.stats.add)){
            RT_libsimilarity$peakgroup=ifelse(`&`(RT_libsimilarity$RT>=(MassHunter.report.RT.stats.add$RT.median[peak]-MassHunter.report.RT.stats.add$RT.shfit.Lower[peak]/60),
                                                  RT_libsimilarity$RT<=(MassHunter.report.RT.stats.add$RT.median[peak]+MassHunter.report.RT.stats.add$RT.shfit.upper[peak]/60)),paste("Peak",peak),RT_libsimilarity$peakgroup)
            RT_libsimilarity$peakgroup_RT_median=ifelse(`&`(RT_libsimilarity$RT>=(MassHunter.report.RT.stats.add$RT.median[peak]-MassHunter.report.RT.stats.add$RT.shfit.Lower[peak]/60),
                                                            RT_libsimilarity$RT<=(MassHunter.report.RT.stats.add$RT.median[peak]+MassHunter.report.RT.stats.add$RT.shfit.upper[peak]/60)),MassHunter.report.RT.stats.add$RT.median[peak],"")
          }
          names.var<- windows_filename(names(Remove.metabolite)[component])          
          boxplotlibsim <- ggplot(RT_libsimilarity, aes(x=peakgroup, y=Net, fill=peakgroup)) + 
            geom_boxplot() +  geom_jitter(shape=16, position=position_jitter(0.2)) + theme(
              panel.background = element_rect(fill = "white"),
              plot.margin = ggplot2::margin(1, 1, 1, 1, "cm"),
              plot.background = element_rect(
                fill = "grey90",
                colour = "black"
              ))
          
          #boxplotlibsim=as_ggplot(boxplot(Weighted~peakgroup,data=RT_libsimilarity, main=names(Remove.metabolite)[i], xlab="Retention time (min)", ylab="Library Score"))
          
          #rtdensityplot=plot(data.frame(Remove.metabolite[i])[,1],1:length(data.frame(Remove.metabolite[i])[,1]),  ylab = "Density",xlab= "Retention time (min)", main = names(Remove.metabolite)[i], lwd=1.5)
          #dev.off()
          
          
          #png(filename=paste(save.folder,"/","aaa",".png",sep=""))
          
          Net.plot=ggplot(RT_libsimilarity, aes(x=RT, y=Net, color=peakgroup,size=2)) + geom_point() + theme(
            panel.background = element_rect(fill = "white"),
            plot.margin = ggplot2::margin(1, 1, 1, 1, "cm"),
            plot.background = element_rect(
              fill = "grey90",
              colour = "black"
            ))
          
          ggpubr::ggarrange(boxplotlibsim,Net.plot,
                            labels = c("boxplot","Net score"),
                            ncol = 2, nrow =1,vjust=1.5) %>%  ggexport(filename = paste(save.folder,"/",names.var,"_lib_score.png",sep=""),width = 900, height = 400,verbose = NULL)
          #dev.off()
        }
      }
    }
    
    if(is.null(MassHunter.report.split)!=TRUE){
      
      deletenamelist=unique(deletenamelist)
      
      MassHunter.report.RT.statscombine<-MassHunter.report.RT.stats[!(MassHunter.report.RT.stats$Name %in% (deletenamelist)),]
      
      MassHunter.report.split=unique(MassHunter.report.split)
      
      MassHunter.report.RT.stats <- rbind(MassHunter.report.RT.statscombine, MassHunter.report.split)
    }
    
    MassHunter.report.RT.stats <- MassHunter.report.RT.stats[order(MassHunter.report.RT.stats$RT.median, decreasing = FALSE),]
    if ("CAS" %in% names(MassHunter.report.RT.stats)) {
      library(stringr)
      MassHunter.report.RT.stats$CAS<-gsub(" ","",MassHunter.report.RT.stats$CAS)
      MassHunter.report.RT.stats$CAS<-gsub("-","",MassHunter.report.RT.stats$CAS)
      MassHunter.report.RT.stats_CAS_width<-str_length(MassHunter.report.RT.stats$CAS)
      cas_last<-str_sub(MassHunter.report.RT.stats$CAS,MassHunter.report.RT.stats_CAS_width,MassHunter.report.RT.stats_CAS_width)
      cas_last_23<-str_sub(MassHunter.report.RT.stats$CAS,MassHunter.report.RT.stats_CAS_width-2,MassHunter.report.RT.stats_CAS_width-1)
      cas_last_rest<-str_sub(MassHunter.report.RT.stats$CAS,1,MassHunter.report.RT.stats_CAS_width-3)
      casfinal<-data.frame(a=cas_last_rest,b=cas_last_23,c=cas_last,stringsAsFactors = F)
      cas_final<-unlist(lapply(1:nrow(casfinal),function(x,casfinal){
        str_glue(casfinal$a[x],"-",casfinal$b[x],"-",casfinal$c[x])
      },casfinal))
      MassHunter.report.RT.stats$CAS<-cas_final
    }
    File.nameNIST <- "Summary report_NIST.csv"
    write.csv(MassHunter.report.RT.stats, file = File.nameNIST,row.names = FALSE)
    write.csv(libr, file = "Library summary.csv",row.names = FALSE)
    
    
    
    
    message(paste("Summary report_NIST.csv was created and save in",workdir, sep=" "))
  } else if (MsLibrary=="InHouse") {
    
    rez<-data.frame(rez,stringsAsFactors=FALSE)
    rez$NAME<-entry.nms
    libr<-rez## returns list of peaks
    
    # Generate summary report
    MassHunter.report<-MassHunter.report
    MassHunter.report<-MassHunter.report[MassHunter.report$RT-MassHunter.report$Expec..RT < Ret.Time.Filter,]
    MassHunter.report<-MassHunter.report[MassHunter.report$RT-MassHunter.report$Expec..RT > -Ret.Time.Filter,]
    MassHunter.report$Name <- gsub("?", "", MassHunter.report$Name, fixed = TRUE)
    MassHunter.report$Name <- gsub("^ ", "", MassHunter.report$Name, perl=T)
    MassHunter.report$Name <- gsub("[*:%^!*?&;$]", "", MassHunter.report$Name, perl=T)
    MassHunter.report$Width <- gsub(">", "",  MassHunter.report$Width, perl=T)
    MassHunter.report$Width <- as.numeric(gsub("scans", "",  MassHunter.report$Width, perl=T))
    
    
    
    Metabolite.list <-split(MassHunter.report$RT, MassHunter.report$Name)
    
    RT.stats<-t(sapply(Metabolite.list,function(x) c(RT.median=round(median(x,na.rm=TRUE),3),
                                                     RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25)-IQR(x,na.rm=TRUE)*1.5))*60,2),
                                                     RT.shift=round(((quantile(x, prob = 0.75)+IQR(x,na.rm=TRUE)*1.5)-median(x,na.rm=TRUE))*60,2))))
    colnames(RT.stats)<-c("RT.median","RT.shfit.Lower", "RT.shfit.upper")
    
    Peak.width <- sapply(split(MassHunter.report$Width, MassHunter.report$Name), function(x) median(x))
    
    
    RT.stats[is.na(RT.stats)]<-0
    ID.stats<-t(sapply(split(MassHunter.report$Weighted, MassHunter.report$Name),function(x) c(Library.match=round(mean(x,na.rm=T)),
                                                                                     Total.ID=length(x[!is.na(x)]))))
    MassHunter.report.RT.stats<-NULL
    MassHunter.report.RT.stats <- as.data.frame(cbind(Name=0,Ref.ion=0,ID.stats,RT.stats,Expec.RT=0,Diff.RT=0, Peak.width, CAS=0))
    MassHunter.report.RT.stats$Ref.ion <- libr$RSN[match(rownames(RT.stats),libr$NAME)]
    MassHunter.report.RT.stats$Expec.RT <- round(as.numeric(libr$RT[match(rownames(RT.stats),libr$NAME)]),2)
    MassHunter.report.RT.stats$CAS <- libr$CASNO[match(rownames(RT.stats),libr$NAME)]
    MassHunter.report.RT.stats$Diff.RT <- MassHunter.report.RT.stats$Expec.RT- as.numeric(MassHunter.report.RT.stats$RT.median)
    MassHunter.report.RT.stats$Name <- rownames(RT.stats)
    
    
    
    
    
    ## Remove point outside the range
    RT.list<-lapply(Metabolite.list,function(x) c(RT.shift=(quantile(x, prob = 0.25)-IQR(x,na.rm=TRUE)*1.5),
                                                  RT.shift=(quantile(x, prob = 0.75)+IQR(x,na.rm=TRUE)*1.5)))
    Remove.outlier<-function(x,y){
      x[x >= y[1]]
      x[x <= y[2]]
    }
    
    Remove.metabolite <-mapply(Remove.outlier, Metabolite.list, RT.list )
    
    # Plot all retention time of all metabolites
    n.total <- length(Metabolite.list)
    par(mar=c(5,10,5,5))
    boxplot(Metabolite.list, xlab= "Retention time (min)", cex.axis=0.5,  horizontal = TRUE, las=2, xaxt="n", main = paste("Retention time distribution of all metabolites ","(n=",n.total,")",sep=""))
    axis(1,cex.axis=1,las=1)
    
    
    ## solve multimodel issue
    MassHunter.report.split<-NULL
    
    save.folder<-paste("RTshift_warnings_",RT.shift.limt,"s_","shift",sep="")
    if (dir.exists(save.folder)!=T) try(dir.create (save.folder))
    for (i in 1:length(Remove.metabolite)){
      if(RT.stats[i,2] + RT.stats[i,3] > RT.shift.limt ){
        
        # plot figure
        names.var<- gsub("/", "", names(Remove.metabolite)[i], perl=T)
        png(filename=paste(save.folder,"/",names.var,".png",sep=""))
        plot(data.frame(Remove.metabolite[i])[,1],1:length(data.frame(Remove.metabolite[i])[,1]),  ylab = "Density",
             xlab= "Retention time (min)", main = names(Remove.metabolite)[i],lwd=1.5)
        dev.off()
        
        #estimate number of model
        m.var <- data.frame(Remove.metabolite[i])[,1]
        metabolite.name<-names(Remove.metabolite[i])
        
        density.all <- density(m.var, bw = "SJ")
        density.var <- density(m.var, bw = "SJ")$y
        density.x <-  density.all$x[density.var>0.01]
        density.var <-  density.var[density.var>0.01]
        
        modes <- NULL
        centre.p <- NULL
        for ( i in 2:(length(density.var)-1) ){
          if ( (density.var[i] > density.var[i-1]) & (density.var[i] > density.var[i+1]) ) {
            modes <- c(modes,i)
          }
          if ( (density.var[i] < density.var[i-1]) & (density.var[i] < density.var[i+1]) ) {
            centre.p <- c(centre.p,i)
          }
        }
        
        new.median<- density.x[modes]
        new.boundary<- density.x [centre.p]
        npeaks <-length(new.median)
        
        if(npeaks>1){
          
          vComposti <- vector("list", npeaks)
          m.var.sub <- m.var
          
          for (i in 1:(npeaks-1)){
            vComposti[i] <-list(m.var.sub [m.var.sub <new.boundary[i]])
            m.var.sub[m.var.sub<new.boundary[i]]<-NA
          }
          
          vComposti[npeaks] <- list(m.var.sub)
          
          RT.add.stats <- t(sapply(vComposti,function(x) c(Total.ID=length(x[!is.na(x)]),
                                                           RT.median=round(median(x,na.rm=TRUE),3),
                                                           RT.shift=round((median(x,na.rm=TRUE)-(quantile(x, prob = 0.25,na.rm=TRUE)-IQR(x,na.rm=TRUE)*3))*60,2),
                                                           RT.shift=round(((quantile(x, prob = 0.75, na.rm=TRUE)+IQR(x,na.rm=TRUE)*3)-median(x,na.rm=TRUE))*60,2))))
          
          colnames(RT.add.stats)<-c("Total.ID","RT.median","RT.shfit.Lower", "RT.shfit.upper")
          MassHunter.report.RT.stats.add <- MassHunter.report.RT.stats[rownames(MassHunter.report.RT.stats)==metabolite.name,]
          MassHunter.report.RT.stats.add <- cbind(MassHunter.report.RT.stats.add[,1:3] ,RT.add.stats, MassHunter.report.RT.stats.add[,8:11], row.names = NULL)
          
          # re-name
          for (i in 1:npeaks){
            MassHunter.report.RT.stats.add$Name[i] <- paste(metabolite.name, " (split peak ", i, ")", sep="")
          }
          MassHunter.report.split <- rbind(MassHunter.report.split, MassHunter.report.RT.stats.add)
        }
      }
    }
    
    if(is.null(MassHunter.report.split)!=TRUE){
      MassHunter.report.RT.stats <- rbind(MassHunter.report.RT.stats, MassHunter.report.split)
    }
    
    MassHunter.report.RT.stats <- MassHunter.report.RT.stats[order(MassHunter.report.RT.stats$RT.median, decreasing = FALSE),]
    File.name <- tclvalue(tkgetSaveFile(initialfile=File.name))
    write.csv(MassHunter.report.RT.stats, file = File.name,row.names = FALSE)
    print(paste("MassHunter summary report.csv is generated and save in",workdir, sep=" "))
  }
}


#' Peak diagnosis by overlay multiple chromatograms
#'
#'The user could visually confirm all peaks, and determine whether the peak is due to background noise by selecting <Overlay All Chromatograms> and clicking the <RUN> button in the box of step 3. A popout window will show a figure that plots chromatograms for all identified metabolites. You can also adjust the font size of metabolite names if the display is inappropriate. You must disable the <Fast Mode> to plot overlay of all chromatograms.
#'
#' @param font.size Size of report font
#' 
#' @return None
#'
#' @examples
#' PeakDiagnosis()
#'
#' @export
PeakDiagnosis <- function(path="GC-Peak Diagnosis Report.csv",font.size=0.5){
  if (!file.exists(path)){path=tcltk::tk_choose.files(default ="GC-Peak Diagnosis Report.csv", caption="choose the Peak Diagnosis Report.csv",multi = F)}
  IntegrationReport<-read.csv(path,stringsAsFactors = F)
  require(lattice)
  dev.new.OS()
  if(exists("redwarning")){
    xyp<-xyplot(Intensity~Retention.Time| factor(Metabolite.Names), groups=Sample.Names, type="l", data=IntegrationReport,
                plot.points=FALSE, scales = list(x ="free",y="free"), as.table=TRUE,
                main="GC-Peak overlay chromatogram",
                xlab="Rentention Time (min)",
                ylab="Peak Intensity",
                par.strip.text=list(cex=font.size),
                par.settings = list(axis.text=list(cex=0.5)),
                
                
                strip = function(..., which.panel, bg) {
                  bg.col = redwarning
                  strip.default(..., which.panel = which.panel,
                                bg = rep(bg.col, length = which.panel)[which.panel])}
                
    )
    print(xyp)
  } else {
    IntegrationReport<-read.csv(path,stringsAsFactors = F)
    dev.new.OS()
    
    xyp<-xyplot(Intensity~Retention.Time| factor(Metabolite.Names), groups=Sample.Names, type="l", data=IntegrationReport,
                plot.points=FALSE, scales = list(x ="free",y="free"), as.table=TRUE,
                main="GC-Peak Integration Report",
                par.strip.text=list(cex=font.size),
                par.settings = list(strip.background = list(col = "gray95"),axis.text=list(cex=0.5))
    )
    print(xyp)
  }
}

multimodel<-function(data,nbins=5,labels=1:nbins,method=c("clusters","length", "content"),na.omit=T){
  library("OneR")
  library("dplyr")
  
  
  
  
}

RT_correction<-function(df,topN=0.90,RT_std_limit=0.002,RT_drift_limit=0.2){
  
  library(dplyr)
  library(dplyr)
  library(ggplot2)
  df = data.table::data.table(df,stringsAsFactors = F)
  
  dfsum <- df %>% dplyr::group_by(Name) %>% dplyr::summarise(unqiuefilenum=length(unique(FileName)),medianRT=median(RT),stdRT=std.outliner.rm(RT),sdRT=sd(RT))
  
  dfsum$stdRT=ifelse(is.na(dfsum$stdRT),0,dfsum$stdRT)
  
  dfsum<-dfsum[dfsum$unqiuefilenum>=topN*max(dfsum$unqiuefilenum),]
  
  dfsum<-dfsum[dfsum$stdRT<=RT_std_limit,]
  
  dfdatafilesum<-df %>% dplyr::group_by(FileName,Name) %>% dplyr::summarise(medianRT=median(as.numeric(RT)))
  
  dfdatafilesum<-dfdatafilesum[dfdatafilesum$Name %in% dfsum$Name,]
  
  
  
  dfdatafilesum$Name=as.factor(dfdatafilesum$Name)
  
  beforecorrection<-ggplot2::ggplot(dfdatafilesum,aes(x=medianRT,y=FileName,color=Name)) + geom_point(size=1) +
    theme(axis.text.y=element_blank())
  
  
  
  df$newRT<- parSapply(cl=autoStopCluster(makeCluster(try(detectCores()))),1:nrow(df),RT_correction_LORESS,df,dfsum)
  df$newRT<-as.numeric(unlist(df$newRT))
  df$rtdiff=abs(df$RT-df$newRT)
  df$finalRT=ifelse('&'(!is.na(df$newRT),df$rtdiff<RT_drift_limit),df$newRT,df$RT)
  df$RT=df$finalRT
  
  dfdatafilesum<-df %>% dplyr::group_by(FileName,Name) %>% dplyr::summarise(medianRT=median(as.numeric(RT)))
  
  dfdatafilesum<-dfdatafilesum[dfdatafilesum$Name %in% dfsum$Name,]
  
  dfdatafilesum$Name=as.factor(dfdatafilesum$Name)
  aftercorrection<-ggplot2::ggplot(dfdatafilesum,aes(x=medianRT,y=FileName,color=Name)) + geom_point(size=1) +
    theme(axis.text.y=element_blank())
  
  
  return(list(df,beforecorrection,aftercorrection))
  }

RT_correction_LORESS<-function(i,df,dfsum){
  
  #df$FileName[i]
  #df$Name[i]
  refdf=df['&'(df$FileName==df$FileName[i],df$Name %in% dfsum$Name),]
  refcpd=unique(dfsum$Name[which(abs(dfsum$medianRT-df$RT[i])==min(abs(dfsum$medianRT-df$RT[i])))])
  refRTmedian=dfsum$medianRT[dfsum$Name %in% refcpd]
  refRT=unique(refdf$RT[refdf$Name %in% refcpd])
  RTdiff=(refRTmedian-refRT)[abs(refRTmedian-refRT)==min(abs(refRTmedian-refRT))]
  if (length(df$RT[i]+(RTdiff))==1) {
  return(df$RT[i]+(RTdiff))  
  }else{return("")}
  
  
}

RT_correction_xcms<-function(datafiles=NULL){
  library(tcltk)
  library(stringr)
  library(xcms)
  library(RColorBrewer)
  if(is.null(datafiles)){datafiles=tk_choose.files(caption = "select the MS data files for RT adjustment")}
  matchtype=c("mzxml$","cdf$")
  #datafiles=gsub(paste0(dirname(datafiles)[1],"/"),"",datafiles)

  datafiles=datafiles[grep(pattern =paste(matchtype,collapse="|"),x=datafiles,ignore.case = T)]
  message("Reading raw data...")
  raw_data <- readMSData(datafiles, mode = "onDisk", msLevel = 1,centroided =T, verbose = F )
  message("Reading raw data finished")
  mfp <- MatchedFilterParam(snthresh = 30, binSize = .5)
  message("Finding features...")
  closeAllConnections()
  cl=makeCluster(12)
  res <- findChromPeaks(raw_data, param = mfp)
  #head(chromPeaks(res))
  #table(chromPeaks(res)[, "sample"])
  
  ## Performing the peak grouping using the "peak density" method.
  p <- PeakDensityParam(sampleGroups = rep(1, length(datafiles)),maxFeatures=1000,minFraction = 0.5,minSamples = 1,binSize = 0.25)
  res <- groupChromPeaks(res, param = p)
  
  foverlap=overlappingFeatures(res, expandRt = 60)
  ## Perform the retention time adjustment using peak groups found in both
  ## files.minFraction equal to the fraction required as being a principle compound
  fgp <- xcms::PeakGroupsParam(minFraction = 0.95)
  ## Before running the alignment we can evaluate which features (peak groups)
  ## would be used based on the specified parameters.
  pkGrps <- xcms::adjustRtimePeakGroups(res, param = fgp)
  #pkGrps <- xcms::adjustRtime(res, param = fgp)
  ## We can also plot these to evaluate if the peak groups span a large portion
  ## of the retention time range.
  pkGrps<-pkGrps/60
  pkGrps<-data.frame(pkGrps,stringsAsFactors = F)
  pkGrpscolor=1:nrow(pkGrps)

  
  
  #axis(side = 2, at = c(1:ncol(pkGrps)), labels = colnames(pkGrps),cex.lab=0.5)
  
  ## Next we perform the alignment.
  res <- adjustRtime(res, param = fgp)
  
  ## Any grouping information was dropped
  #hasFeatures(res)
  
  ## Adjusterd retention times can be accessed using
  ## rtime(object, adjusted = TRUE) and adjustedRtime
  #all.equal(rtime(res), adjustedRtime(res))
  
  ## To get the raw, unadjusted retention times:
  #all.equal(rtime(res, adjusted = FALSE), rtime(raw_data))
  
  ## To extract the retention times grouped by sample/file:
  rts <- rtime(res, bySample = T)
  rtsraw <- rtime(raw_data)
  

   sampleftlength=sapply(rts,length) 
   rtsrawlist<-list()
   for (subset in 1: length(sampleftlength)){
     if (subset!=1){
      start=sum(sampleftlength[1:subset-1]) 
     }else{
      start=0
     }
     
     rtsrawlist[[as.character(subset)]]=rtsraw[(1+start):(sampleftlength[subset]+start)]
     
   }
  finalrts_sum<-function(rts){  
   finalrts<-data.frame()
  for (list in names(rts)){
    
    a<-rts[[list]]
    fid=names(a)
    fid=t(data.frame(str_split(fid,"\\.")))
    filenum=as.numeric(gsub("F","",fid[1,1]))
  a<-as.data.frame(rts[[list]]) 
  a$fID= fid[,2]
  colnames(a)=c(as.character(res@phenoData@data[["sampleNames"]][as.numeric(filenum)]),"fID")
  if (length(finalrts)==0){
  finalrts=a
  }else{
  finalrts=merge(finalrts,a,by="fID",all=TRUE) 
  }
  } 
   finalrts
     }
  finalrts=finalrts_sum(rts)
  finalrtsraw=finalrts_sum(rtsrawlist)
  pkGrpsadj=finalrts[(gsub("FT","",rownames(pkGrps))),]
  pkGrpsadj=pkGrpsadj[,-1]
  pkGrpsraw=finalrtsraw[(gsub("FT","",rownames(pkGrps))),]
  pkGrpsraw=pkGrpsraw[,-1]
  
  png("RTadjustment_QC.png",width = 15,height = 15,units = "in",res = 150)
  par(mfrow = c(2,2))
  
  plot(x = pkGrps[, 1], y = rep(1, nrow(pkGrps)), xlim =range(pkGrps,na.rm=T), col= pkGrpscolor,ylim=c(1,ncol(pkGrps)) ,xlab = "rt", ylab = "Data Files", yaxt = "n", main= "Representative feature RT")
  for (col in 2:ncol(pkGrps)){
    points(x = pkGrps[, col], y = rep(col, nrow(pkGrps)),col= pkGrpscolor)  
    segments(x0 = pkGrps[, col-1], x1 = pkGrps[, col],
             y0 = rep(col-1, nrow(pkGrps)), y1 = rep(col, nrow(pkGrps)),col= pkGrpscolor)  
  }
  grid()
  
  
  
  
  ## Plot the raw against the adjusted retention times.
  plot((rtime(raw_data)/60), (rtime(res)/60), pch = 16, cex = 0.25, col = fromFile(res),xlab = "Raw", ylab = "Adjusted",  main= "RT comparision before and after")
  
   plot(x = pkGrpsraw[, 1], y = rep(1, nrow(pkGrpsraw)), xlim =range(pkGrpsraw,na.rm=T), col= pkGrpscolor,ylim=c(1,ncol(pkGrpsraw)) ,xlab = "rt", ylab = "Data Files", yaxt = "n", main= "Before RT adjustment")
  for (col in 2:ncol(pkGrpsraw)){
    points(x = pkGrpsraw[, col], y = rep(col, nrow(pkGrpsraw)),col= pkGrpscolor)  
    segments(x0 = pkGrpsraw[, col-1], x1 = pkGrpsraw[, col],
             y0 = rep(col-1, nrow(pkGrpsraw)), y1 = rep(col, nrow(pkGrpsraw)),col= pkGrpscolor)  
  }
  grid()
  plot(x = pkGrpsadj[, 1], y = rep(1, nrow(pkGrpsadj)), xlim =range(pkGrpsadj,na.rm=T), col= pkGrpscolor,ylim=c(1,ncol(pkGrpsadj)) ,xlab = "rt", ylab = "Data Files", yaxt = "n", main= "After RT adjustment")
  for (col in 2:ncol(pkGrpsadj)){
    points(x = pkGrpsadj[, col], y = rep(col, nrow(pkGrpsadj)),col= pkGrpscolor)  
    segments(x0 = pkGrpsadj[, col-1], x1 = pkGrpsadj[, col],
             y0 = rep(col-1, nrow(pkGrpsadj)), y1 = rep(col, nrow(pkGrpsadj)),col= pkGrpscolor)  
  }
  grid()
  
 
  
  mtext("RTadjustment Quality control", side = 3, line = -2, outer = TRUE)
  dev.off()
  write.csv(finalrts,"finalRTadjustment_matrix.csv")
  return(list(finalrtsraw,finalrts))
}

Par_peakintegrate<-function(h,library_file,findScanTime,raw_data,intensity_type,Ion.bin){
  
  scanrangeL <- min(findScanTime@scantime)
  scanrangeU <- max(findScanTime@scantime)
  mzacqrangeL <- min(findScanTime@mzrange)
  mzacqrangeU <- max(findScanTime@mzrange)
  
  metabolite <-  library_file[h, ]
  R.Ion<-metabolite$Ref.ion
  RT.lowerset<-metabolite$RT.shfit.Lower
  RT.upperset<-metabolite$RT.shfit.upper
  R.Time<-metabolite$RT.median*60
  
  EIClower=R.Time-RT.lowerset
  EICupper=R.Time+RT.upperset
  EICmzlower=R.Ion-Ion.bin
  EICmzupper=R.Ion+Ion.bin
  if((R.Time-RT.lowerset)<=scanrangeL) EIClower=scanrangeL
  if((R.Time+RT.upperset)>=scanrangeU) EICupper=scanrangeU
  if((R.Ion-Ion.bin)<=mzacqrangeL) EICmzlower=mzacqrangeL
  if((R.Ion+Ion.bin)>=mzacqrangeU) EICmzupper=mzacqrangeU
  
  if (sum(EICupper<scanrangeL,EIClower>scanrangeU,(EICmzlower)>mzacqrangeU,(EICmzupper)<mzacqrangeL,na.rm = T)>=1) {
    Peakvalue=0
    return(Peakvalue)
  }else{
    
    IonExtract <-xcms::getEIC(raw_data,mzrange=cbind(EICmzlower,EICmzupper),rtrange=cbind(EIClower,EICupper))
    abundance<-data.frame(IonExtract@eic$xcmsRaw[1])
    ifelse(intensity_type=="Peak Height", Peakvalue <- max(abundance$intensity) ,Peakvalue<-flux::auc(abundance$rt, abundance$intensity*10))
    return(Peakvalue)
  }
  
}

Par_peakintegrate_slow<-function(h,library_file,findScanTime,raw_data,intensity_type,Ion.bin,name.file){
  
  
  scanrangeL <- min(findScanTime@scantime)
  scanrangeU <- max(findScanTime@scantime)
  mzacqrangeL <- min(findScanTime@mzrange)
  mzacqrangeU <- max(findScanTime@mzrange)
  
  metabolite <-  library_file[h, ]
  R.Ion<-metabolite$Ref.ion
  RT.lowerset<-metabolite$RT.shfit.Lower
  RT.upperset<-metabolite$RT.shfit.upper
  R.Time<-metabolite$RT.median*60
  
  EIClower=R.Time-RT.lowerset
  EICupper=R.Time+RT.upperset
  EICmzlower=R.Ion-Ion.bin
  EICmzupper=R.Ion+Ion.bin
  if((R.Time-RT.lowerset)<=scanrangeL) EIClower=scanrangeL
  if((R.Time+RT.upperset)>=scanrangeU) EICupper=scanrangeU
  if((R.Ion-Ion.bin)<=mzacqrangeL) EICmzlower=mzacqrangeL
  if((R.Ion+Ion.bin)>=mzacqrangeU) EICmzupper=mzacqrangeU
  
  if (sum(EICupper<scanrangeL,EIClower>scanrangeU,(EICmzlower)>mzacqrangeU,(EICmzupper)<mzacqrangeL,na.rm = T)>=1) {
    Peakvalue=0
    abundance=data.frame(rt=NA,intensity=NA)
  }else{
    
    IonExtract <-xcms::getEIC(raw_data,mzrange=cbind(EICmzlower,EICmzupper),rtrange=cbind(EIClower,EICupper))
    abundance<-data.frame(IonExtract@eic$xcmsRaw[1])
    ifelse(intensity_type=="Peak Height", Peakvalue <- max(abundance$intensity) ,Peakvalue<-flux::auc(abundance$rt, abundance$intensity*10))
    
  }
  
  result = list(Peakvalue=Peakvalue,Graphic.df=data.frame(Metabolite.Names=paste(metabolite$Name," (m/z:",R.Ion,")",sep="") ,Retention.Time=(abundance$rt)/60,
                                                          Intensity=abundance$intensity ,Sample.Names=name.file ))
  return(result)
}

testpar<-function(){
  for( h in 1: nrow(library_file)){
  
  scanrangeL <- min(findScanTime@scantime)
  scanrangeU <- max(findScanTime@scantime)
  mzacqrangeL <- min(findScanTime@mzrange)
  mzacqrangeU <- max(findScanTime@mzrange)
  
  metabolite <-  library_file[h, ]
  R.Ion<-metabolite$Ref.ion
  RT.lowerset<-metabolite$RT.shfit.Lower
  RT.upperset<-metabolite$RT.shfit.upper
  R.Time<-metabolite$RT.median*60
  
  EIClower=R.Time-RT.lowerset
  EICupper=R.Time+RT.upperset
  EICmzlower=R.Ion-Ion.bin
  EICmzupper=R.Ion+Ion.bin
  if((R.Time-RT.lowerset)<=scanrangeL) EIClower=scanrangeL
  if((R.Time+RT.upperset)>=scanrangeU) EICupper=scanrangeU
  if((R.Ion-Ion.bin)<=mzacqrangeL) EICmzlower=mzacqrangeL
  if((R.Ion+Ion.bin)>=mzacqrangeU) EICmzupper=mzacqrangeU
  
  if (sum(EICupper<scanrangeL,EIClower>scanrangeU,(EICmzlower)>mzacqrangeU,(EICmzupper)<mzacqrangeL,na.rm = T)>=1) {
    Peakvalue=0
    
  }else{
    
    IonExtract <-xcms::getEIC(raw_data,mzrange=cbind(EICmzlower,EICmzupper),rtrange=cbind(EIClower,EICupper))
    abundance<-data.frame(IonExtract@eic$xcmsRaw[1])
    ifelse(intensity_type=="Peak Height", Peakvalue <- max(abundance$intensity) ,Peakvalue<-flux::auc(abundance$rt, abundance$intensity*10))
    
  }
  
  result = list(Peakvalue=Peakvalue,Graphic.df=data.frame(Metabolite.Names=paste(metabolite$Name," (m/z:",R.Ion,")",sep="") ,Retention.Time=(abundance$rt)/60,
                                                          Intensity=abundance$intensity ,Sample.Names=name.file ))
  
}
  
}

adjustRtimePeakGroups <- function(object, param = PeakGroupsParam(),
                                  msLevel = 1L) {
  if (!is(object, "XCMSnExp"))
    stop("'object' has to be an 'XCMSnExp' object.")
  if (!hasFeatures(object))
    stop("No features present. Please run 'groupChromPeaks' first.")
  if (hasAdjustedRtime(object))
    warning("Alignment/retention time correction was already performed, ",
            "returning a matrix with adjusted retention times.")
  subs <- subset(param)
  if (!length(subs))
    subs <- seq_along(fileNames(object))
  nSamples <- length(subs)
  pkGrp <- .getPeakGroupsRtMatrix(
    peaks = chromPeaks(object, msLevel = msLevel),
    peakIndex = .peakIndex(
      .update_feature_definitions(featureDefinitions(object),
                                  rownames(chromPeaks(object)),
                                  rownames(chromPeaks(object,
                                                      msLevel = msLevel)))),
    sampleIndex = subs,
    missingSample = nSamples - (nSamples * minFraction(param)),
    extraPeaks = extraPeaks(param)
  )
  colnames(pkGrp) <- basename(fileNames(object))[subs]
  pkGrp
}

.getPeakGroupsRtMatrix <- function(peaks, peakIndex, sampleIndex,
                                   missingSample, extraPeaks) {
  ## For each feature:
  ## o extract the retention time of the peak with the highest intensity.
  ## o skip peak groups if they are not assigned a peak in at least a
  ##   minimum number of samples OR if have too many peaks from the same
  ##   sample assigned to it.
  nSamples <- length(sampleIndex)
  rt <- lapply(peakIndex, function(z) {
    cur_fts <- peaks[z, c("rt", "into", "sample"), drop = FALSE]
    ## Return NULL if we've got less samples that required or is the total
    ## number of peaks is larger than a certain threshold.
    ## Note that the original implementation is not completely correct!
    ## nsamp > nsamp + extraPeaks might be correct.
    nsamp <- length(unique(cur_fts[, "sample"]))
    if (nsamp < (nSamples - missingSample) |
        nrow(cur_fts) > (nsamp + extraPeaks))
      return(NULL)
    cur_fts[] <- cur_fts[order(cur_fts[, 2], decreasing = TRUE), ]
    cur_fts[match(sampleIndex, cur_fts[, 3]), 1]
  })
  rt <- do.call(rbind, (rt))
  ## Order them by median retention time. NOTE: this is different from the
  ## original code, in which the peak groups are ordered by the median
  ## retention time that is calculated over ALL peaks within the peak
  ## group, not only to one peak selected for each sample (for multi
  ## peak per sample assignments).
  ## Fix for issue #175
  if (is(rt, "matrix")) {
    rt <- rt[order(rowMedians(rt, na.rm = TRUE)), , drop = FALSE]
  }
  rt
}
.peakIndex <- function(object) {
  if (inherits(object, "DataFrame")) {
    idxs <- object$peakidx
    names(idxs) <- rownames(object)
  } else {
    if (!hasFeatures(object))
      stop("No feature definitions present. Please run groupChromPeaks first.")
    idxs <- featureDefinitions(object)$peakidx
    names(idxs) <- rownames(featureDefinitions(object))
  }
  idxs
}
.update_feature_definitions <- function(x, original_names, subset_names) {
  x$peakidx <- lapply(x$peakidx, function(z) {
    idx <- base::match(original_names[z], subset_names)
    idx[!is.na(idx)]
  })
  x[lengths(x$peakidx) > 0, ]
}


erah_pipeline<-function(workdir=getwd(),infopath=NULL){
  
  library(erah)
  if (is.null(infopath)){tk_choose.files(caption = "Select info file to normalized by SERRF")}
    #check if it is csv of xlsx
  if(grepl(".xls.?$", infopath)){
    pData <- openxlsx::read.xlsx(infopath, sheet = 1,colNames = T)
  }else if(grepl(".csv", infopath)){
    # file = "C:\\Users\\Sili Fan\\Downloads\\val (18).csv"
    if(nrow(d)<2){pData <- read.csv(infopath)}
    pData <- data.table::fread(infopath)
  }
    
  
  
}

xcmsRaw_dev<-function (filename, profstep = 1, profmethod = "bin", profparam = list(), 
          includeMSn = FALSE, mslevel = NULL, scanrange = NULL) 
{
  object <- new("xcmsRaw")
  object@env <- new.env(parent = .GlobalEnv)
  object@filepath <- xcmsSource(filename)
  rawdata <- loadRaw(object@filepath, includeMSn = includeMSn)
  rtdiff <- diff(rawdata$rt)
  if (any(rtdiff == 0)) 
    warning("There are identical scantimes.")
  if (any(rtdiff < 0)) {
    badtimes <- which(rtdiff < 0)
    stop(paste("Time for scan ", badtimes[1], " (", 
               rawdata$rt[[badtimes[1]]], ") greater than scan ", 
               badtimes[1] + 1, " (", rawdata$rt[[badtimes[1] + 
                                                    1]], ")", sep = ""))
  }
  object@scantime <- rawdata$rt
  object@tic <- rawdata$tic
  object@scanindex <- rawdata$scanindex
  object@env$mz <- rawdata$mz
  object@env$intensity <- rawdata$intensity
  if (length(scanrange) < 2) {
    scanrange <- c(1, length(object@scantime))
  }else {
    scanrange <- range(scanrange)
  }
  if (min(scanrange) < 1 | max(scanrange) > length(object@scantime)) {
    scanrange[1] <- max(1, scanrange[1])
    scanrange[2] <- min(length(object@scantime), scanrange[2])
    message("Provided scanrange was adjusted to ", 
            scanrange[1], " - ", scanrange[2])
  }
  if (!is.null(rawdata$acquisitionNum)) {
    object@acquisitionNum <- rawdata$acquisitionNum
  }
  if (!is.null(rawdata$polarity)) {
    object@polarity <- factor(rawdata$polarity, levels = c(0, 
                                                           1, -1), labels = c("negative", "positive", 
                                                                              "unknown"))
  }
  if (!is.null(rawdata$MSn)) {
    object@env$msnMz <- rawdata$MSn$mz
    object@env$msnIntensity <- rawdata$MSn$intensity
    object@msnScanindex <- rawdata$MSn$scanindex
    object@msnAcquisitionNum <- rawdata$MSn$acquisitionNum
    object@msnLevel <- rawdata$MSn$msLevel
    object@msnRt <- rawdata$MSn$rt
    object@msnPrecursorScan <- match(rawdata$MSn$precursorNum, 
                                     object@acquisitionNum)
    object@msnPrecursorMz <- rawdata$MSn$precursorMZ
    object@msnPrecursorIntensity <- rawdata$MSn$precursorIntensity
    object@msnPrecursorCharge <- rawdata$MSn$precursorCharge
    object@msnCollisionEnergy <- rawdata$MSn$collisionEnergy
  }
  xcms:::scanrange(object) <- as.numeric(scanrange)
  object <- object[scanrange[1]:scanrange[2]]
  xcms:::mslevel(object) <- as.numeric(mslevel)
  object@mzrange <- range(object@env$mz, na.rm = TRUE)
  object@profmethod <- profmethod
  object@profparam <- profparam
  if (profstep) 
    xcms:::profStep(object) <- profstep
  if (!missing(mslevel) & !is.null(mslevel)) {
    if (max(mslevel) > 1) {
      object <- msn2ms(object)
      object <- split(object, f = object@msnLevel == mslevel)$"TRUE"
    }
  }
  return(object)
}
MASHUOA/MassOmics documentation built on Nov. 3, 2023, 10:48 p.m.