affxparser: Affymetrix File Parsing SDK

Documented in readCcgHeader

#########################################################################/**
# @RdocFunction readCcgHeader
#
# @title "Reads an the header of an Affymetrix Command Console Generic (CCG) file"
#
# @synopsis
#
# \description{
#   @get "title".
# }
#
# \arguments{
#   \item{pathname}{The pathname of the CCG file.}
#   \item{verbose}{An @integer specifying the verbose level. If 0, the
#     file is parsed quietly.  The higher numbers, the more details.}
#   \item{.filter}{A @list.}
#   \item{...}{Not used.}
# }
#
# \value{
#   A named @list structure consisting of ...
# }
#
# @author "HB"
#
#  \details{
#    Note, the current implementation of this methods does not utilize the
#    Affymetrix Fusion SDK library.  Instead, it is implemented in R from the
#    file format definition [1].
#  }
#
#  \seealso{
#    @see "readCcg".
#  }
#
# \references{
#  [1] Affymetrix Inc, Affymetrix GCOS 1.x compatible file formats,
#      April, 2006.
#      \url{http://www.affymetrix.com/support/developer/}\cr
# }
#
# @keyword "file"
# @keyword "IO"
#*/#########################################################################
readCcgHeader <- function(pathname, verbose=0, .filter=list(fileHeader=TRUE, dataHeader=TRUE), ...) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'pathname':
  if (inherits(pathname, "connection")) {
    con <- pathname;
    pathname <- NA;
  } else {
    if (!file.exists(pathname))
      stop("File not found: ", pathname);
    con <- file(pathname, open="rb");
    on.exit(close(con));
  }

  # Argument '.filter':
  hasFilter <- FALSE;
  if (!is.null(.filter)) {
    if (!is.list(.filter)) {
      stop("Argument '.filter' must be a list: ", mode(.filter));
    }
    hasFilter <- TRUE;
  }

  header <- list(filename=pathname);

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read file header
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  hdr <- .readCcgFileHeader(con);
  if (identical(.filter$fileHeader, TRUE) || is.list(.filter$fileHeader)) {
    header$fileHeader <- hdr;
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read the data header
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  hdr <- .readCcgDataHeader(con, .filter=.filter$dataHeader);
  if (identical(.filter$dataHeader, TRUE) || is.list(.filter$dataHeader)) {
    header$dataHeader <- hdr;
  }

  header;
} # readCcgHeader()



# File Header
# The file header section is the first section of the file. This
# section is used to identify the type of file (i.e. Command Console
# data file), its version number (for the file format) and the number
# of data groups stored within the file. Information about the contents
# of the file such as the data type identifier, the parameters used to
# create the file and its parentage is stored within the generic data
# header section.
#
# Item 	Description 	Type
# 1 Magic number. A value to identify that this is a Command Console
#    data file. The value will be fixed to 59. 	[UBYTE]
# 2 The version number of the file. This is the version of the file
#    format. It is currently fixed to 1. [UBYTE]
# 3 The number of data groups. [INT]
# 4 File position of the first data group. [UINT]
.readCcgFileHeader <- function(pathname, .filter=NULL, ...) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Local functions
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  readByte <- function(con, n=1, ...) {
    readBin(con, what=integer(), size=1, signed=TRUE, endian="big", n=n);
  }

  readUByte <- function(con, n=1, ...) {
    readBin(con, what=integer(), size=1, signed=FALSE, endian="big", n=n);
  }

  readInt <- function(con, n=1, ...) {
    readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
  }

  readUInt <- function(con, n=1, ...) {
    # NOTE: Ideally we would use signed=FALSE here, but there is no
    # integer data type in R that can hold 4-byte unsigned integers.
    # Because of this limitation, readBin() will give a warning that
    # signed=FALSE only works for size=1 or 2.
    # WORKAROUND: Use signed=TRUE and assume there are no values
    # greater that .Machine$integer.max == 2^31-1. /HB 2015-04-15
    readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'pathname':
  if (inherits(pathname, "connection")) {
    con <- pathname;
  } else {
    if (!file.exists(pathname))
      stop("File not found: ", pathname);
    con <- file(pathname, open="rb");
    on.exit(close(con));
  }

  # Argument '.filter':
  hasFilter <- FALSE;
  if (!is.null(.filter)) {
    if (!is.list(.filter)) {
      stop("Argument '.filter' must be a list: ", mode(.filter));
    }
    hasFilter <- TRUE;
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  magic <- readUByte(con);
  if (magic != 59)
    stop("File format error: Not a CCG file. Magic is not 59: ", magic);

  version <- readUByte(con);
  nbrOfDataGroups <- readInt(con);
  dataGroupStart <- readUInt(con);

  list(
    version = version,
    nbrOfDataGroups = nbrOfDataGroups,
    dataGroupStart = dataGroupStart
  )
} # .readCcgFileHeader()



.readCcgDataHeader <- function(con, .filter=NULL, ...) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Local functions
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  rawToString <- function(raw, ...) {
    # This approach drops all '\0', in order to avoid warnings
    # in rawToChar().  Note, it does not truncate the string after
    # the first '\0'.  However, such strings should never occur in
    # the first place.
    raw <- raw[raw != as.raw(0)];
    rawToChar(raw);
  }

  readByte <- function(con, n=1, ...) {
    readBin(con, what=integer(), size=1, signed=TRUE, endian="big", n=n);
  }

  readUByte <- function(con, n=1, ...) {
    readBin(con, what=integer(), size=1, signed=FALSE, endian="big", n=n);
  }

  readInt <- function(con, n=1, ...) {
    readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
  }

  readUInt <- function(con, n=1, ...) {
    # NOTE: Ideally we would use signed=FALSE here, but there is no
    # integer data type in R that can hold 4-byte unsigned integers.
    # Because of this limitation, readBin() will give a warning that
    # signed=FALSE only works for size=1 or 2.
    # WORKAROUND: Use signed=TRUE and assume there are no values
    # greater that .Machine$integer.max == 2^31-1. /HB 2015-04-15
    readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
  }

  readString <- function(con, ...) {
    nchars <- readInt(con);
    if (nchars == 0)
      return("");
    readChar(con, nchars=nchars);
  }

  readWString <- function(con, ...) {
    nchars <- readInt(con);
    if (nchars == 0)
      return("");
    raw <- readBin(con, what=raw(), n=2*nchars);
    raw <- raw[seq(from=2, to=length(raw), by=2)];
    rawToString(raw);
  }

  readRaw <- function(con, ...) {
    n <- readInt(con);
    if (n == 0)
      return(raw(0));
    readBin(con, what=raw(0), n=n);
  }

  readWVT <- function(con, ...) {
    name <- readWString(con);
    raw <- readRaw(con);
    type <- readWString(con);

    # Update data types
    # * text/x-calvin-integer-8
    # * text/x-calvin-unsigned-integer-8
    # * text/x-calvin-integer-16
    # * text/x-calvin-unsigned-integer-16
    # * text/x-calvin-integer-32
    # * text/x-calvin-unsigned-integer-32
    # * text/x-calvin-float
    # * text/plain
    n <- length(raw);
    value <- switch(type,
      "text/ascii" = {
        rawToString(raw);
      },

      "text/plain" = {
        # Unicode/UTF-16?!?
        raw <- matrix(raw, ncol=2, byrow=TRUE);
        raw <- raw[,2];
        rawToString(raw);
      },

      "text/x-calvin-integer-8" = {
        readBin(raw, what=integer(0), endian="big", size=1, signed=TRUE, n=n);
      },

      "text/x-calvin-unsigned-integer-8" = {
        readBin(raw, what=integer(0), endian="big", size=1, signed=FALSE, n=n);
      },

      "text/x-calvin-integer-16" = {
        readBin(raw, what=integer(0), endian="big", size=2, signed=TRUE, n=n/2);
      },

      "text/x-calvin-unsigned-integer-16" = {
        readBin(raw, what=integer(0), endian="big", size=2, signed=FALSE, n=n/2);
      },

      "text/x-calvin-integer-32" = {
        readBin(raw, what=integer(0), endian="big", size=4, signed=TRUE, n=n/4);
      },

      "text/x-calvin-unsigned-integer-32" = {
        # NOTE: Ideally we would use signed=FALSE here, but there is no
        # integer data type in R that can hold 4-byte unsigned integers.
        # Because of this limitation, readBin() will give a warning that
        # signed=FALSE only works for size=1 or 2.
        # WORKAROUND: Use signed=TRUE and assume there are no values
        # greater that .Machine$integer.max == 2^31-1. /HB 2015-04-15
        readBin(raw, what=integer(0), endian="big", size=4, signed=TRUE, n=n/4);
      },

      "text/x-calvin-float" = {
        readBin(raw, what=double(0), endian="big", size=4, n=n/4);
      },

      {
        raw;
      }
    ) # switch()

    list(name=name, value=value, raw=raw, type=type);
  } # readWVT()

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  hasFilter <- FALSE;
  if (!is.null(.filter)) {
    hasFilter <- TRUE;
  }

  # Nothing to do?
  if (hasFilter) {
    if (identical(.filter, FALSE) || length(.filter) == 0)
      return(NULL);
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  hdr <- list(
    dataTypeId = readString(con),
    fileId = readString(con),
    timestamp = readWString(con),
    locale = readWString(con)
  )

  # Reading parameters
  nbrOfParams <- readInt(con);
  params <- vector("list", nbrOfParams);
  names <- character(nbrOfParams);
  for (kk in seq_len(nbrOfParams)) {
    wvt <- readWVT(con);
    names[kk] <- wvt$name;
    value <- wvt$value;
#    attr(value, "raw") <- wvt$raw;
    attr(value, "mimeType") <- wvt$type;
    params[[kk]] <- value;
  }
  names(params) <- names;
  hdr$parameters <- params;

  # Reading parent headers
  nbrOfParents <- readInt(con);
  parents <- vector("list", nbrOfParents);
  for (kk in seq_len(nbrOfParents)) {
    parents[[kk]] <- .readCcgDataHeader(con);
  }
  hdr$parents <- parents;

  hdr;
} # .readCcgDataHeader()




############################################################################
# HISTORY:
# 2012-05-18
# o Now using stop() instead of throw().
# 2011-11-01
# o CLEANUP: Changed signed=FALSE to signed=TRUE for readBin() calls
#   reading 4-byte integers in internal .readCcgFileHeader() and
#   .readCcgDataHeader().
# 2009-02-10
# o Added internal rawToString() replacing rawToChar() to avoid warnings
#   on "truncating string with embedded nul".
# 2007-08-16
# o Now the read data is converted according to the mime type.  See internal
#   readWVT() function.  The code is still ad hoc, so it is not generic.
#   For instance, it basically assumes that Unicode strings only contain
#   ASCII/ASCII-8 characters.
# 2006-11-06
# o Tested on Test3-1-121502.calvin.CEL and Test3-1-121502.calvin.CDF.
# o Created.
############################################################################