#' List TopDown files
#'
#' List all TopDown files:
#' - .fasta (peptide sequence)
#' - .mzML (spectra)
#' - .experiments.csv (fragmentation conditions)
#' - .txt (header information)
#'
#' @param path file path
#' @param pattern filename pattern
#' @return list (splitted by file extension) with file path
#' @noRd
.listTopDownFiles <- function(path, pattern=".*") {
files <- list.files(
path,
pattern=paste0(pattern, "(", .topDownFileExtRx("cfmt"), ")"),
recursive=TRUE,
full.names=TRUE
)
l <- split(normalizePath(files), .fileExt(files))
n <- lengths(l)
ext <- c("csv", "fasta", "mzML", "txt")
if (!length(n) || any(!ext %in% names(l))) {
stop(
"Could not find any ",
paste0(ext[!ext %in% names(l)], collapse=", "), " files!"
)
}
if (n["fasta"] > 1L) {
stop("More than one fasta file found. Consider the 'pattern' argument.")
}
if (!all(n["csv"] == n[!grepl("fasta", names(n))])) {
nd <- n[!grepl("fasta", names(n))]
stop(
"There have to be the same number of csv, mzML and txt files. ",
"Found: ", paste0(names(nd), "=", nd, collapse=", ")
)
}
l
}
#' Read ScanHeadMans method (experiments.csv) output.
#'
#' 1 row per condition
#'
#' @param file character, filename
#' @param verbose logical, verbose output?
#' @return data.frame
#' @noRd
.readExperimentCsv <- function(file, verbose=interactive()) {
if (.fileExt(file) != "csv")
stop("'file' has to be a csv file with the extension '.csv'")
d <- read.csv(file, na.strings=c("NA", "N/A"), stringsAsFactors=FALSE)
colnames(d) <- .camelCase(colnames(d))
.msg(verbose,
"Reading ", nrow(d), " experiment conditions from file ", basename(file)
)
## drop MS1
d <- d[d$MsLevel == 2L, ]
d$Condition <- seq_len(nrow(d))
d$Mz <- .targetedMassListToMz(d$TargetedMassList)
d$File <- gsub(.topDownFileExtRx("csv"), "", basename(file))
d
}
#' Read fasta file.
#'
#' Stupid fasta reader, ignores comments, and keeps just the first sequence.
#'
#' @param file character, filename
#' @param verbose logical, verbose output?
#' @return character
#' @noRd
.readFasta <- function(file, verbose=interactive()) {
aa <- readAAStringSet(file, nrec=1L, use.names=FALSE)[[1L]]
.msg(verbose, "Reading sequence from fasta file ", basename(file))
if (!length(aa)) {
stop("No sequence found.")
}
aa
}
#' Read ScanHeadMans header (txt) output.
#'
#' 1 row per scan (could have more rows than experiments.csv, because multiple
#' scans per condition are possible).
#'
#' @param file `character`, filename
#' @param conditions `character`/`numeric`, how to create condition IDs.
#' Conditions could be read from `"ScanDescription"`, calculated on
#' `"FilterString"` or by a given number of conditions.
#' @param verbose `logical`, verbose output?
#' @return data.frame
#' @noRd
.readScanHeadsTable <- function(file, conditions="ScanDescription",
verbose=interactive()) {
if (.fileExt(file) != "txt")
stop("'file' has to be a txt file with the extension '.txt'")
d <- read.csv(
file,
na.strings=c("NA", "N/A"), stringsAsFactors=FALSE, strip.white=TRUE
)
colnames(d) <- .camelCase(colnames(d))
.msg(verbose,
"Reading ", nrow(d), " header information from file ", basename(file)
)
## drop MS1
d <- d[d$MsOrder == 2L, ]
if (is.character(conditions)) {
## maybe more will follow
conditions <- match.arg(conditions, c("ScanDescription", "FilterString"))
if (conditions == "FilterString") {
## Somehow the FilterString doesn't always contains the right mass
## label and we have to correct them.
## See also:
## - https://github.com/sgibb/topdownr/issues/25
fixedFilterStrings <- .fixFilterString(d$FilterString)
if (nFixed <- sum(fixedFilterStrings != d$FilterString)) {
warning(
nFixed, " FilterString entries modified because of ",
"duplicated ID for different conditions.",
immediate.=verbose
)
}
d$FilterString <- fixedFilterStrings
d$Condition <- as.integer(.filterStringToId(d$FilterString))
## Sometimes skips happen and the ID is not the same as in the
## FilterString (happens often in the missing_scans files)
## See also:
## - https://github.com/sgibb/topdownr/issues/14
if (is.unsorted(d$Condition)) {
warning(
"ID in FilterString are not sorted in ascending order. ",
"Introduce own condition ID via 'cumsum'.",
immediate.=verbose
)
d$Condition <- cumsum(
c(TRUE, d$FilterString[-1L] != d$FilterString[-nrow(d)])
)
}
} else if (conditions == "ScanDescription") {
d$Condition <- match(d$ScanDescription, unique(d$ScanDescription))
}
} else if (is.numeric(conditions) && length(conditions) == 1L) {
d$Condition <- rep_len(seq_len(conditions), nrow(d))
} else {
stop("'conditions' has to be a 'character' or 'numeric' of length one.")
}
activation <- c("ETD", "CID", "HCD", "UVPD")
activationColumns <- .camelCase(paste0(activation, "Activation"))
d[, activationColumns] <- NA_real_
for (i in seq(along=activation)) {
## first activation
sel <- d$Activation1 == activation[i]
if (sum(sel)) {
d[sel, activationColumns[i]] <- d$Energy1[sel]
}
## second activation
if (activation[i] %in% c("CID", "HCD") &&
"Activation2" %in% names(d)) {
sel <- d$Activation2 == activation[i]
sel[is.na(sel)] <- FALSE
if (sum(sel)) {
d[sel, activationColumns[i]] <- d$Energy2[sel]
}
}
}
d$Activation <- .fragmentationMethod(d[, activationColumns])
d$File <- gsub(.topDownFileExtRx("txt"), "", basename(file))
d
}
#' Translate scan headsman rt to mzml scan numbers
#'
#' @param file character, filename
#' @param rt numeric
#' @param verbose logical, verbose output?
#' @return numeric
#' @noRd
.rtToScanId <- function(file, rt, verbose=interactive()) {
.msg(verbose, "Reading scan information from file ", basename(file))
fh <- openMSfile(file)
on.exit(close(fh))
hd <- header(fh)
## TODO: replace by MsCoreUtils::closest, because it is maintained and this
## function name is misleading
i <- .matchFragments(
rt * 60, hd$retentionTime, tolerance=1e3,
redundantIonMatch="closest", redundantFragmentMatch="closest"
)
.translateThermoIdToScanId(hd$spectrumId[i])
}
#' Read MS2 Spectra (mzML)
#'
#' @param file character, filename
#' @param fmass double, fragment mass
#' @param \ldots further arguments passed to .matchFragments
#' @param verbose logical, verbose output?
#' @return list (with data.frame for header and sparseMatrix with intensity
#' values)
#' @noRd
.readMzMl <- function(file, scans, fmass, ..., verbose=interactive()) {
.msg(verbose,
"Reading spectra information from file ", basename(file),
appendLF=FALSE
)
fh <- openMSfile(file)
on.exit(close(fh))
hd <- header(fh)
hd$Scan <- .translateThermoIdToScanId(hd$spectrumId)
i <- which(hd$msLevel == 2L & hd$Scan %in% scans)
hd <- hd[i, , drop=FALSE]
hd$acquisitionNum <- NULL
colnames(hd)[grepl("seqNum", colnames(hd), fixed=TRUE)] <- "SpectrumIndex"
hd$File <- gsub(.topDownFileExtRx("mzml"), "", basename(file))
colnames(hd) <- .camelCase(colnames(hd))
## depending on the used processing software the header column totIonCurrent
## doesn't take deisotoping and charge state reduction into account; so we
## calculate TIC for our own here
hd$TotIonCurrent <- NA_real_
nr <- nrow(hd)
m <- Matrix(0L, nrow=length(fmass), ncol=nr, sparse=TRUE)
for (j in seq_along(i)) {
pks <- peaks(fh, i[j])
k <- .matchFragments(pks[, 1L], fmass, ...)
notNA <- !is.na(k)
if (sum(notNA)) {
m[k[notNA], j] <- pks[notNA, 2L]
hd[j, "TotIonCurrent"] <- sum(pks[, 2L])
}
}
.msg(verbose,
sprintf(" (%02.1f%%)", round(nnzero(m) / sum(hd$PeaksCount) * 100, 1L))
)
list(hd=hd, m=m)
}
#' Read single (complete) MS2 Spectrum (mzML)
#'
#' @param file `character`, filename
#' @param i `integer`, scan idx (in file)
#' @return `matrix` (`mzR::peaks`)
#' @noRd
.readSpectrum <- function(file, i) {
fh <- openMSfile(file)
on.exit(close(fh))
if (i <= 0L || i > runInfo(fh)$scanCount) {
stop("Invalid spectrum index. It has to be 1:",
runInfo(fh)$scanCount, ".")
}
peaks(fh, i)
}
#' Merge ScanCondition and HeaderInformation
#'
#' @param sc data.frame, scan conditions
#' @param hi data.frame, header information
#' @return data.frame
#' @noRd
.mergeScanConditionAndHeaderInformation <- function(sc, hi) {
if (!is(sc, "data.frame") || !is(hi, "data.frame"))
stop("'sc' and 'hi' have to be of class 'data.frame'.")
d <- merge(
sc, hi,
by=c("File", "Condition"),
suffixes=c(".ScanCondition", ".HeaderInformation")
)
naSACe <- is.na(d$SupplementalActivationCe)
naCid <- is.na(d$CidActivation)
naHcd <- is.na(d$HcdActivation)
if (any(d$SupplementalActivationCe[!naSACe] != d$Energy2[!naSACe]) ||
any(d$SupplementalActivationCe[!(naSACe | naCid)] !=
d$CidActivation[!(naSACe | naCid)]) ||
any(d$SupplementalActivationCe[!(naSACe | naHcd)] !=
d$HcdActivation[!(naSACe | naHcd)])) {
stop("Merging of header and method information failed. ",
"Differences in 'SupplementalActivationCe', 'Energy2', ",
"'CidActivation' and/or 'HcdActivation' found.")
}
d
}
#' Merge spectra and ScanConditions/HeaderInformation (into featureData slot)
#'
#' @param mzml data.frame, header from mzML files
#' @param scdm data.frame, header from ScanHeadsman
#' @return merged data.frame
#' @noRd
.mergeSpectraAndHeaderInformation <- function(mzml, scdm) {
d <- merge(
mzml, scdm, sort=FALSE,
by=c("File", "Scan"),
suffixes=c(".SpectraInformation", ".HeaderInformation")
)
if (!.isEqual(d$RetentionTime, d$RtMin * 60, tolerance=1e-3)) {
stop("The retention times from header and spectra files differ.")
}
d
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.