R/buildRecord.R

Defines functions renderPeaks .selfDefinedAccessionBuilder .standardAccessionBuilder .simpleAccessionBuilder .buildRecord.RmbSpectrum2 getAnalyticalInfo .buildRecord.RmbSpectraSet

Documented in .buildRecord.RmbSpectraSet getAnalyticalInfo

# TODO: Add comment
# 
# Author: stravsmi
###############################################################################
#' @import assertthat


#' @export
setGeneric("buildRecord", function(o, ...) standardGeneric("buildRecord"))

#' Compile MassBank records
#' 
#' Takes a spectra block for a compound, as returned from
#' \code{\link{analyzeMsMs}}, and an aggregated cleaned peak table, together
#' with a MassBank information block, as stored in the infolists and loaded via
#' \code{\link{loadInfolist}}/\code{\link{readMbdata}} and processes them to a
#' MassBank record
#' 
#' \code{compileRecord} calls \code{\link{gatherCompound}} to create blocks of
#' spectrum data, and finally fills in the record title and accession number,
#' renames the "internal ID" comment field and removes dummy fields.
#' 
#' @usage compileRecord(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard")
#' @param spec A \code{RmbSpectraSet} for a compound, after analysis (\code{\link{analyzeMsMs}}).
#' Note that \bold{peaks are not read from this
#' object anymore}: Peaks come from the \code{aggregated} dataframe (and from
#' the global \code{additionalPeaks} dataframe; cf. \code{\link{addPeaks}} for
#' usage information.)
#' @param mbdata The information data block for the record header, as stored in
#' \code{mbdata_relisted} after loading an infolist.
#' @param aggregated An aggregated peak data table containing information about refiltered spectra etc.
#' @param additionalPeaks If present, a table with additional peaks to add into the spectra.
#' 		As loaded with \code{\link{addPeaks}}.
#' @param retrieval A value that determines whether the files should be handled either as "standard",
#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown"
#' if the only know thing is the m/z
#' @return Returns a MassBank record in list format: e.g.
#' \code{list("ACCESSION" = "XX123456", "RECORD_TITLE" = "Cubane", ...,
#' "CH\$LINK" = list( "CAS" = "12-345-6", "CHEMSPIDER" = 1111, ...))}
#' @author Michael Stravs
#' @seealso \code{\link{mbWorkflow}}, \code{\link{addPeaks}},
#' \code{\link{gatherCompound}}, \code{\link{toMassbank}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#' 
#' #
#' \dontrun{myspec <- w@@spectra[[2]]}
#' # after having loaded an infolist:
#' \dontrun{mbdata <- mbdata_relisted[[which(mbdata_archive\$id == as.numeric(myspec\$id))]]}
#' \dontrun{compiled <- compileRecord(myspec, mbdata, w@@aggregated)}
#' 

.buildRecord.RmbSpectraSet <- function(cpd, ..., mbdata = list(), additionalPeaks = NULL)
{
	# gather the individual spectra data
	analyticalInfo <- getAnalyticalInfo(cpd)

	# Go through all child spectra, and add metadata to all the info slots
	# Pass them the AC_LC and AC_MS data, which are added at the right place
	# directly in there.
	allSpectra <- lapply(cpd@children, function(s)
				buildRecord(s, ..., cpd=cpd, mbdata=mbdata, analyticalInfo=analyticalInfo, 
            additionalPeaks=additionalPeaks))
	allSpectra <- allSpectra[which(!is.na(allSpectra))]
	if(length(allSpectra) > 0)
		cpd@children <- as(allSpectra, "SimpleList")
	else
		cpd@children <- new("RmbSpectrum2List")
  cpd
}

#' @export
setMethod("buildRecord", "RmbSpectraSet", function(o, ..., mbdata = list(), additionalPeaks = NULL)
      .buildRecord.RmbSpectraSet(cpd=o, ..., mbdata = mbdata, additionalPeaks = additionalPeaks)
    )



# For each compound, this function creates the "lower part" of the MassBank record, i.e.
# everything that comes after AC$INSTRUMENT_TYPE.
#' Compose data block of MassBank record
#' 
#' \code{gatherCompound} composes the data blocks (the "lower half") of all
#' MassBank records for a compound, using the annotation data in the RMassBank
#' options, spectrum info data from the \code{analyzedSpec}-type record and the
#' peaks from the reanalyzed, multiplicity-filtered peak table. It calls
#' \code{gatherSpectrum} for each child spectrum.
#' 
#' The returned data blocks are in format \code{list( "AC\$MASS_SPECTROMETRY" =
#' list('FRAGMENTATION_MODE' = 'CID', ...), ...)} etc.
#' 
#' @aliases gatherCompound gatherSpectrum
#' @usage gatherCompound(spec, aggregated, additionalPeaks = NULL, retrieval="standard")
#' 
#' 		gatherSpectrum(spec, msmsdata, ac_ms, ac_lc, aggregated, 
#'	 		additionalPeaks = NULL, retrieval="standard")
#' @param spec A \code{RmbSpectraSet} object, representing a compound with multiple spectra.
#' @param aggregated An aggregate peak table where the peaks are extracted from.
#' @param msmsdata A \code{RmbSpectrum2} object from the \code{spec} spectra set, representing a single spectrum to give a record.
#' @param ac_ms,ac_lc Information for the AC\$MASS_SPECTROMETRY and
#' AC\$CHROMATOGRAPHY fields in the MassBank record, created by
#' \code{gatherCompound} and then fed into \code{gatherSpectrum}.
#' @param additionalPeaks If present, a table with additional peaks to add into the spectra.
#' 		As loaded with \code{\link{addPeaks}}.
#' @param retrieval A value that determines whether the files should be handled either as "standard",
#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown"
#' if the only know thing is the m/z
#' @return \code{gatherCompound} returns a list of tree-like MassBank data
#' blocks. \code{gatherSpectrum} returns one single MassBank data block or
#' \code{NA} if no useful peak is in the spectrum. 
#' @note Note that the global table \code{additionalPeaks} is also used as an
#' additional source of peaks.
#' @author Michael Stravs
#' @seealso \code{\link{mbWorkflow}}, \code{\link{compileRecord}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples \dontrun{
#'      myspectrum <- w@@spectra[[1]]
#' 		massbankdata <- gatherCompound(myspectrum, w@@aggregated)
#' 		# Note: ac_lc and ac_ms are data blocks usually generated in gatherCompound and
#' 		# passed on from there. The call below gives a relatively useless result :)
#' 		ac_lc_dummy <- list()
#' 		ac_ms_dummy <- list() 
#' 		justOneSpectrum <- gatherSpectrum(myspectrum, myspectrum@@child[[2]],
#' 			ac_ms_dummy, ac_lc_dummy, w@@aggregated)
#' }
#' 
#' 
#' @export
getAnalyticalInfo <- function(cpd = NULL)
{
  ai <- list()
	# define positive or negative, based on processing mode.
  if(!is.null(cpd))
	  mode <- .ionModes[[cpd@mode]]
	
  # again, these constants are read from the options:
  ai[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument
  ai[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type

	# for format 2.01
	ac_ms <- list();
	ac_ms[['MS_TYPE']] <- getOption("RMassBank")$annotations$ms_type
	ac_ms[['ION_MODE']] <- mode
	ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization
	
	# This list could be made customizable.
	ac_lc <- list();
  if(!is.null(cpd))
	  rt  <- cpd@parent@rt / 60
	ac_lc[['COLUMN_NAME']] <- getOption("RMassBank")$annotations$lc_column
	ac_lc[['FLOW_GRADIENT']] <- getOption("RMassBank")$annotations$lc_gradient
	ac_lc[['FLOW_RATE']] <- getOption("RMassBank")$annotations$lc_flow
	ac_lc[['RETENTION_TIME']] <- sprintf("%.3f min", rt)  
	ac_lc[['SOLVENT A']] <- getOption("RMassBank")$annotations$lc_solvent_a
	ac_lc[['SOLVENT B']] <- getOption("RMassBank")$annotations$lc_solvent_b
	
	# Treutler fixes for custom properties, trying to forwardport this here
	
	## add generic AC$MASS_SPECTROMETRY information
	properties      <- names(getOption("RMassBank")$annotations)
	presentProperties <- names(ac_ms)#c('MS_TYPE', 'IONIZATION', 'ION_MODE')#, 'FRAGMENTATION_MODE', 'COLLISION_ENERGY', 'RESOLUTION')
	
	theseProperties <- grepl(x = properties, pattern = "^AC\\$MASS_SPECTROMETRY_")
	properties2     <- gsub(x = properties, pattern = "^AC\\$MASS_SPECTROMETRY_", replacement = "")
	theseProperties <- theseProperties & !(properties2 %in% presentProperties)
	theseProperties <- theseProperties & (unlist(getOption("RMassBank")$annotations) != "NA")
	ac_ms[properties2[theseProperties]] <- unlist(getOption("RMassBank")$annotations[theseProperties])
	
	## add generic AC$CHROMATOGRAPHY information
	#properties      <- names(getOption("RMassBank")$annotations)
	theseProperties <- grepl(x = properties, pattern = "^AC\\$CHROMATOGRAPHY_")
	properties2     <- gsub(x = properties, pattern = "^AC\\$CHROMATOGRAPHY_", replacement = "")
	presentProperties <- names(ac_lc)#c('COLUMN_NAME', 'FLOW_GRADIENT', 'FLOW_RATE', 'RETENTION_TIME', 'SOLVENT A', 'SOLVENT B')
	theseProperties <- theseProperties & !(properties2 %in% presentProperties)
	theseProperties <- theseProperties & (unlist(getOption("RMassBank")$annotations) != "NA")
	ac_lc[properties2[theseProperties]] <- unlist(getOption("RMassBank")$annotations[theseProperties])
	

	
	return(list( ai=ai, ac_lc=ac_lc, ac_ms=ac_ms))
}


# Process one single MSMS child scan.
# spec: an object of "analyzedSpectrum" type (i.e. contains 
#       14x (or other number) msmsdata, info, mzrange,
#       compound ID, parent MS1, cpd id...)
# msmsdata: the msmsdata sub-object from the spec which is the child scan we want to process.
#       Contains childFilt, childBad, scan #, etc. Note that the peaks are actually not
#       taken from here! They were taken from msmsdata initially, but after introduction
#       of the refiltration and multiplicity filtering, this was changed. Now only the
#       scan information is actually taken from msmsdata.
# ac_ms, ac_lc: pre-filled info for the MassBank dataset (see above)
# refiltered: the refilteredRcSpecs dataset which contains our good peaks :)
#       Contains peaksOK, peaksReanOK, peaksFiltered, peaksFilteredReanalysis, 
#       peaksProblematic. Currently we use peaksOK and peaksReanOK to create the files.
#       (Also, the global additionalPeaks table is used.)
#' @export
setMethod("buildRecord", "RmbSpectrum2", function(o, ..., cpd = NULL, mbdata = list(), analyticalInfo = list(), additionalPeaks = NULL)
      .buildRecord.RmbSpectrum2(spectrum = o, cpd=cpd, mbdata=mbdata, analyticalInfo=analyticalInfo, additionalPeaks=additionalPeaks, ...)
)

.buildRecord.RmbSpectrum2 <- function(spectrum, ..., cpd = NULL, mbdata = list(), analyticalInfo = list(), additionalPeaks = NULL)
{

	if(length(analyticalInfo$ac_ms) > 0)
		ac_ms=analyticalInfo$ac_ms
	else
		ac_ms=list()

	if(length(analyticalInfo$ac_lc) > 0)
		ac_lc=analyticalInfo$ac_lc
	else
		ac_lc=list()

	if(length(mbdata) == 0)
	{
		if(is.null(cpd))
			mbdata <- gatherDataMinimal.spectrum(spectrum)
		else
			mbdata <- gatherDataMinimal.cpd(cpd)
	}

	if(length(analyticalInfo$ai) > 0)
		mbdata <- c(mbdata, analyticalInfo$ai)

	# If the spectrum is not filled, return right now. All "NA" spectra will
	# not be treated further.
	# If step 2 was not performed, instead, spectrum@ok is empty and we want to export it, so proceed.
	if(length(spectrum@ok) > 0)
	{
		if(spectrum@ok == FALSE)
			return(NA)
	}
	# get data
	scan <- spectrum@acquisitionNum


	# Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data:
	ac_ms[['FRAGMENTATION_MODE']] <- spectrum@info$mode
	#ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode]
	if(length(spectrum@info$ce) > 0)
		ac_ms[['COLLISION_ENERGY']] <- spectrum@info$ce
	else
		ac_ms[['COLLISION_ENERGY']] <- spectrum@collisionEnergy
	ac_ms[['RESOLUTION']] <- spectrum@info$res

	# Calculate exact precursor mass with Rcdk, and find the base peak from the parent
	# spectrum. (Yes, that's what belongs here, I think.)

	ms_fi <- list()
	if(!is.null(cpd))
	{
		ms_fi[['BASE_PEAK']] <- round(mz(cpd@parent)[which.max(intensity(cpd@parent))],4)
		ms_fi[['PRECURSOR_M/Z']] <- round(cpd@mz,4)
		ms_fi[['PRECURSOR_TYPE']] <- .precursorTypes[cpd@mode]

		if(all(!is.na(spectrum@precursorIntensity), 
		   spectrum@precursorIntensity != 0, 
		   spectrum@precursorIntensity != 100, na.rm = TRUE))
			ms_fi[['PRECURSOR_INTENSITY']] <- spectrum@precursorIntensity
	}


	# Create the "lower part" of the record.  

	# Add the AC$MS, AC$LC info.
	if(getOption("RMassBank")$use_version == 2)
	{
		if(length(ac_ms) >0)
			mbdata[["AC$MASS_SPECTROMETRY"]] <- ac_ms
		if(length(ac_lc) >0)
			mbdata[["AC$CHROMATOGRAPHY"]] <- ac_lc
	}
	else
	{
		# Fix for MassBank data format 1, where ION_MODE must be renamed to MODE
		ac <- c(ac_ms, ac_lc)
		if(length(ac) > 0)
		{
			mbdata[["AC$ANALYTICAL_CONDITION"]] <- ac
			names(mbdata[["AC$ANALYTICAL_CONDITION"]])[[
			  which(names(mbdata[["AC$ANALYTICAL_CONDITION"]]) == "ION_MODE")
			]] <- "MODE"
		}
	}
	# Add the MS$FOCUSED_ION info.
	if(length(ms_fi) > 0)
		mbdata[["MS$FOCUSED_ION"]] <- ms_fi

	## The SPLASH is a hash value calculated across all peaks
	## http://splash.fiehnlab.ucdavis.edu/
	## Has to be temporarily added as "PK$SPLASH" in the "lower" part
	## of the record, but will later be moved "up" when merging parts in compileRecord()  

	# the data processing tag :)
	# Change by Tobias:
	# I suggest to add here the current version number of the clone due to better distinction between different makes of MB records
	# Could be automatised from DESCRIPTION file?
	if(getOption("RMassBank")$use_rean_peaks)
		processingComment <- list("REANALYZE" = "Peaks with additional N2/O included")
	else
		processingComment <- list()
	mbdata[["MS$DATA_PROCESSING"]] <- c(
	  getOption("RMassBank")$annotations$ms_dataprocessing,
	  processingComment,
	  list("WHOLE" = paste("RMassBank", packageVersion("RMassBank")))
	)

	if(length(spectrum@info$ces) > 0)
		mbdata[['RECORD_TITLE_CE']] <- spectrum@info$ces
	else
		mbdata[['RECORD_TITLE_CE']] <- spectrum@collisionEnergy

	# Mode of relative scan calculation: by default it is calculated relative to the
	# parent scan. If a corresponding option is set, it will be calculated from the first
	# present child scan in the list.

	if(!is.null(cpd))
	{
		relativeScan <- "fromParent"
		if(!is.null(getOption("RMassBank")$recomputeRelativeScan))
			if(getOption("RMassBank")$recomputeRelativeScan == "fromFirstChild")
				relativeScan <- "fromFirstChild"
		if(relativeScan == "fromParent")
			subscan <- spectrum@acquisitionNum - cpd@parent@acquisitionNum #relative scan
		else if(relativeScan == "fromFirstChild")
		{
			firstChild <- min(unlist(lapply(cpd@children,function(d) d@acquisitionNum)))
			subscan <- spectrum@acquisitionNum - firstChild + 1
		}
	}


	# Here is the right place to fix the name of the INTERNAL ID field.
	if(!is.null(getOption("RMassBank")$annotations$internal_id_fieldname))
	{
		id.col <- which(names(mbdata[["COMMENT"]]) == "ID")
		if(length(id.col) > 0)
		{
			names(mbdata[["COMMENT"]])[[id.col]] <-
			getOption("RMassBank")$annotations$internal_id_fieldname
		}
	}
	# get mode parameter (for accession number generation) depending on version 
	# of record definition
	# Generate the title and then delete the temprary RECORD_TITLE_CE field used before
	mbdata[["RECORD_TITLE"]] <- .parseTitleString(mbdata)
	mbdata[["RECORD_TITLE_CE"]] <- NULL
	# Calculate the accession number from the options.
	userSettings = getOption("RMassBank")
	# Use a user-defined accessionBuilder, if present
	if("accessionBuilderType" %in% names(userSettings))
	{
		assert_that(userSettings$accessionBuilderType %in% c(
		  "standard", "simple", "selfDefined"),
		  msg=paste("accessionNumberType must be one of",
		  "'standard', 'simple' or 'selfDefined'"))
		mbdata[['ACCESSION']] <- switch(
		  userSettings$accessionBuilderType,
		  simple = .simpleAccessionBuilder(subscan),
		  standard = .standardAccessionBuilder(cpd, subscan),
		  selfDefined = .selfDefinedAccessionBuilder(cpd, spectrum,
		  subscan)
		)
	}
	else
	{
		mbdata[['ACCESSION']] <- .standardAccessionBuilder(cpd, subscan)
	}

	spectrum@info <- mbdata

	spectrum <- renderPeaks(spectrum, cpd=cpd, additionalPeaks=additionalPeaks, ...)

	return(spectrum)
}

.simpleAccessionBuilder <- function(subscan)
{
	userSettings = getOption("RMassBank")
	assert_that('accessionNumberStart' %in% names(userSettings),
	  msg=paste("accessionBuilderType is 'simple', but",
	  "accessionNumberStart is not provided.",
	  "You may set the value of accessionBuilderType to",
	  "'standard' or 'selfDefined' to use a different accessionBuilder.", 
	  "For detailed explanations of accessionBuilders, check out the",
	  "'Settings' section of the RMassBank vignette."))
	sprintf('%s%06d', userSettings$annotations$entry_prefix,
	  subscan + userSettings$accessionNumberStart)
}

.standardAccessionBuilder <- function(cpd, subscan)
{
	userSettings = getOption("RMassBank")
	shift <- userSettings$accessionNumberShifts[[cpd@mode]]
	sprintf("%s%04d%02d", userSettings$annotations$entry_prefix,
	  as.numeric(cpd@id), subscan+shift)
}
	
.selfDefinedAccessionBuilder <- function(cpd, spectrum, subscan)
{
	#This is a wrapper for the user-defined accessionBuilder
	userSettings = getOption("RMassBank")
	assert_that("accessionBuilderFile" %in% names(userSettings),
	  msg=paste("accessionBuilderType is 'selfDefined', but",
	  "accessionBuilderFile is not provided.",
	  "You may set the value of accessionBuilderType",
	  "to 'standard' or 'simple' to use a different accessionBuilder.", 
	  "For detailed explanations of accessionBuilders, check out the",
	  "'Settings' section of the RMassBank vignette."))
	source(userSettings$accessionBuilderFile)
	#The file must contain a function called 'accessionBuilder'
	#with arguments cpd, spectrum, subscan
	assert_that(exists("accessionBuilder"),
	  msg=paste('No accessionBuilder defined in',
	  userSettings$accessionBuilderFile))
	assert_that(class(accessionBuilder)=='function',
	  msg='accessionBuilder must be a function')
	assert_that(has_args(accessionBuilder,
	  c('cpd', 'spectrum', 'subscan'), exact=TRUE),
	  msg=paste('accessionBuilder must have function arguments',
	  'cpd, spectrum, subscan in this order'))
	accessionBuilder(cpd, spectrum, subscan)
}
renderPeaks <- function(spectrum, ..., cpd = NULL, additionalPeaks = NULL)
{
	# Select all peaks which belong to this spectrum (correct cpdID and scan no.)
	# from peaksOK
	# Note: Here and below it would be easy to customize the source of the peaks.
	# Originally the peaks came from msmsdata$childFilt, and the subset
	# was used where dppm == dppmBest (because childFilt still contains multiple formulas)
	# per peak.
  spectrum <- .fillSlots(spectrum, c("good", "dppm", "dppmBest", "mzCalc", "formula", "formulaCount"))
  peaks <- getData(spectrum)
  property(spectrum, "best", addNew=TRUE, "logical") <- (peaks$good %in% TRUE) & (peaks$dppm == peaks$dppmBest)
  spectrum <- normalize(spectrum, 999, slot="intrel", ...)
  peaks <- getData(selectPeaks(spectrum, ...))
  # filterOK is the final criterion for selection, it includes both reanalyzed and original matches.
  # If there was no peak filtering performed, use best | matchedReanalysis (which gets both regular and reanalyzed matches)
  # To get peaks without the reanalyzed matches, use best
  # rawOK gives the unfiltered, not denoised spectrum.
  # Any other condition can be used (also for example intrel > 50)
  
  if(!getOption("RMassBank")$use_rean_peaks)
    peaks <- peaks[peaks$formulaSource == "analyze",,drop=FALSE]
  
  
	# No peaks? Aha, bye
	if(nrow(peaks) == 0)
		return(NA)
	
	# Calculate relative intensity and make a formatted m/z to use in the output
	# (mzSpec, for "spectrum")
	#peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999)

	# reorder peaks after addition of the reanalyzed ones


	# copy the peak table to the annotation table. (The peak table will then be extended
	# with peaks from the global "additional_peaks" table, which can be used to add peaks
	# to the spectra by hand.

	
	# Here add the additional peaks if there are any for this compound!
	# They are added without any annotation.
  if(is.null(additionalPeaks))
    additionalPeaks <- data.frame()
  
  if(!is.null(cpd))
  {
    if(ncol(additionalPeaks) > 0)
    {
      # select the peaks from the corresponding spectrum which were marked with "OK=1" in the table.
      spec_add_peaks <- additionalPeaks[
          (!is.na(additionalPeaks$OK)) &
              (additionalPeaks$OK == 1) & 
              (as.character(additionalPeaks$cpdID) == cpd@id) &
              (additionalPeaks$scan == spectrum@acquisitionNum),
          c("mzFound", "intensity")]
      # If there are peaks to add:
      if(nrow(spec_add_peaks)>0)
      {
        colnames(spec_add_peaks) <- c("mz", "intensity")
        # bind tables together. First add in NA fillers for all columns not in spec_add_peaks
        for(column in setdiff(colnames(peaks), colnames(spec_add_peaks)))
          spec_add_peaks[,column] <- new(class(peaks[,column]), NA)
        #print(spec_add_peaks)
        peaks <- rbind(peaks, spec_add_peaks[,colnames(peaks),drop=FALSE])
        # recalculate rel.int.  and reorder list
      }
    }
  }
  
  # recalculate relative intensity with the newly added peaks. Note that possibly this leads to spectra
  # with intrel < what was specified in the original filter!
  peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999)
  peaks <- peaks[order(peaks$mz),]
  
	# build annotation
  spectrum <- setData(spectrum, peaks) 

	return(spectrum)
}
sneumann/RMassBank documentation built on Oct. 20, 2020, 3:19 p.m.