affxparser: Affymetrix File Parsing SDK

Documented in updateCel

#########################################################################/**
# @RdocFunction updateCel
#
# @title "Updates a CEL file"
#
# @synopsis
#
# \description{
#   @get "title".
# }
#
# \arguments{
#   \item{filename}{The filename of the CEL file.}
#   \item{indices}{A @numeric @vector of cell (probe) indices specifying
#     which cells to updated.  If @NULL, all indices are considered.}
#   \item{intensities}{A @numeric @vector of intensity values to be stored.
#     Alternatively, it can also be a named @data.frame or @matrix (or @list)
#     where the named columns (elements) are the fields to be updated.}
#   \item{stdvs}{A optional @numeric @vector.}
#   \item{pixels}{A optional @numeric @vector.}
#   \item{writeMap}{An optional write map.}
#   \item{...}{Not used.}
#   \item{verbose}{An @integer specifying how much verbose details are
#     outputted.}
# }
#
# \value{
#   Returns (invisibly) the pathname of the file updated.
# }
#
# \details{
#   Currently only binary (v4) CEL files are supported.
#   The current version of the method does not make use of the Fusion SDK,
#   but its own code to navigate and update the CEL file.
# }
#
# @examples "../incl/updateCel.Rex"
#
# @author "HB"
#
# @keyword "file"
# @keyword "IO"
#*/#########################################################################
updateCel <- function(filename, indices=NULL, intensities=NULL, stdvs=NULL, pixels=NULL, writeMap=NULL, ..., verbose=0) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'filename':
  if (!file.exists(filename)) {
    stop("Cannot update CEL file. File not found: ", filename);
  }

  header <- readCelHeader(filename);
  version <- header$version;
  if (!version %in% 4) {
    stop("Updating CEL v", version, " files is not supported: ", filename);
  }

  nbrOfCells <- header$total;

  # Argument 'indices':
  if (is.null(indices)) {
    nbrOfIndices <- nbrOfCells;
  } else {
    # A CEL file has one-based indices
    r <- range(indices);
    if (r[1] < 1 || r[2] > nbrOfCells) {
      stop("Argument 'indices' is out of range [1,", nbrOfCells, "]: ",
           "[", r[1], ",", r[2], "]");
    }
    nbrOfIndices <- length(indices);
  }

  # Argument 'intensities':
  if (is.matrix(intensities)) {
    intensities <- as.data.frame(intensities);
  }

  if (is.list(intensities) || is.data.frame(intensities)) {
    if (is.list(intensities)) {
      fields <- names(intensities);
    } else {
      fields <- colnames(intensities);
    }

    if (is.null(stdvs) && ("stdvs" %in% fields)) {
      stdvs <- intensities[["stdvs"]];
    }

    if (is.null(pixels) && ("pixels" %in% fields)) {
      pixels <- intensities[["pixels"]];
    }

    if ("intensities" %in% fields) {
      intensities <- intensities[["intensities"]];
    }
  }

  # Argument 'intensities':
  if (!is.null(intensities)) {
    if (!is.double(intensities))
      intensities <- as.double(intensities);
    if (length(intensities) != nbrOfIndices) {
      stop("Number of 'intensities' values does not match the number of cell indices: ", length(intensities), " != ", nbrOfIndices);
    }
  }

  # Argument 'stdvs':
  if (!is.null(stdvs)) {
    if (!is.double(stdvs))
      stdvs <- as.double(stdvs);
    if (length(stdvs) != nbrOfIndices) {
      stop("Number of 'stdvs' values does not match the number of cell indices: ", length(stdvs), " != ", nbrOfIndices);
    }
  }

  # Argument 'pixels':
  if (!is.null(pixels)) {
    if (!is.integer(pixels))
      pixels <- as.integer(pixels);
    if (length(pixels) != nbrOfIndices) {
      stop("Number of 'pixels' values does not match the number of cell indices: ", length(pixels), " != ", nbrOfIndices);
    }
  }

  # Argument 'writeMap':
  if (!is.null(writeMap)) {
    writeMap <- .assertMap(writeMap, nbrOfCells);
  }

  # Argument 'verbose':
  if (length(verbose) != 1)
    stop("Argument 'verbose' must be a single integer.");
  verbose <- as.integer(verbose);
  if (!is.finite(verbose))
    stop("Argument 'verbose' must be an integer: ", verbose);

  # Nothing to do?
  if (nbrOfIndices == 0) {
    return(invisible(filename));
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Reorder data such that it is written in optimal order
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  reorder <- TRUE;
  if (is.null(indices)) {
    # Has write map?
    if (is.null(writeMap)) {
      indices <- 1:nbrOfIndices;
      reorder <- FALSE;
    } else {
      indices <- writeMap;
    }
  }

  if (reorder) {
    if (verbose >= 2)
      cat("Re-ordering data for optimal write order...");
    o <- order(indices);
    indices <- indices[o];
    if (!is.null(intensities))
      intensities <- intensities[o];
    if (!is.null(stdvs))
      stdvs <- stdvs[o];
    if (!is.null(pixels))
      pixels <- pixels[o];
    o <- NULL; # Not needed anymore
    if (verbose >= 2)
      cat("done.\n");
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Write data to file
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  version <- header$version;
  if (version == 4) {
    # Open CEL file
    con <- file(filename, open="r+b");
    on.exit(close(con));

    # Skip CEL header
    if (verbose >= 2)
      cat("Skipping to beginging of data section...");
    h <- .readCelHeaderV4(con);

    # "Cell entries - this consists of an intensity value, standard
    #  deviation value and pixel count for each cell in the array.
    #  The values are stored by row then column starting with the X=0,
    #  Y=0 cell. As an example, the first five entries are for cells
    #  defined by XY coordinates: (0,0), (1,0), (2,0), (3,0), (4,0).
    #  Type: (float, float, short) = 4 + 4 + 2 = 10 bytes / cell
    #  cellData <- c(readFloat(con), readFloat(con), readShort(con));
    sizeOfCell <- 10;

    # Current file position
    dataOffset <- seek(con, origin="start", rw="read");

    if (verbose >= 2)
      cat("done.\n");

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Update in chunks
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    CHUNK.SIZE <- 2^19; # = 524288 indices == 5.2Mb
    CHUNK.SIZE <- 2^20; # = 1048576 indices == 10.5Mb

    # Work with zero-based indices
    indices <- indices - 1;

    count <- 1;
    offset <- dataOffset;
    nbrOfChunks <- ceiling(length(indices) / CHUNK.SIZE);
    while (length(indices) > 0) {
      if (verbose >= 1) {
        cat(sprintf("Number of indices left: %d\n", length(indices)));
        cat(sprintf("Updating chunk #%d of %d...\n", count, nbrOfChunks));
      }

      # Recall: All indices are ordered!

      # Shift offset to the first index.
      firstIndex <- indices[1];
      offset <- offset + sizeOfCell*firstIndex;

      # Shift indices such that first index is zero.
      indices <- indices - firstIndex;

      # Get largest index
      maxIndex <- indices[length(indices)];
#      verbose && cat(verbose, "Largest index: ", maxIndex, "\n");

      # Identify the indices to update such no more than CHUNK.SIZE cells
      # are read/updated.
      n <- which.max(indices >= CHUNK.SIZE);
      if (n == 1)
        n <- length(indices);

      subset <- 1:n;

      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # Read the data section of the CEL file
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      if (verbose >= 1)
        cat("Reading chunk data section...");

      seek(con, origin="start", where=offset, rw="read");
      rawAll <- readBin(con=con, what="raw", n=sizeOfCell*(indices[n]+1));
      if (verbose >= 1)
        cat("done.\n");
      # Common to all fields
      raw <- NULL;

      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # Update 'intensities'
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      if (!is.null(intensities)) {
        if (verbose >= 1)
          cat("Updating 'intensities'...");
        # Write floats (size=4) to raw vector
        raw <- raw(length=4*n);
        raw <- writeBin(con=raw, intensities[subset], size=4, endian="little");
        intensities <- intensities[-subset]; # Not needed anymore

        # Updated 'rawAll' accordingly
        idx <- rep(sizeOfCell*indices[subset], each=4) + 1:4;
        rawAll[idx] <- raw;
        idx <- NULL; # Not needed anymore
        if (verbose >= 1)
          cat("done.\n");
      }

      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # Update 'stdvs'
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      if (!is.null(stdvs)) {
        if (verbose >= 1)
          cat("Updating 'stdvs'...");
        # Write floats (size=4) to raw vector
        if (length(raw) != 4*n)
          raw <- raw(length=4*n);
        raw <- writeBin(con=raw, stdvs[subset], size=4, endian="little");
        stdvs <- stdvs[-subset]; # Not needed anymore

        # Updated 'rawAll' accordingly
        idx <- rep(sizeOfCell*indices[subset], each=4) + 5:8;
        rawAll[idx] <- raw;
        idx <- NULL; # Not needed anymore
        if (verbose >= 1)
          cat("done.\n");
      }

      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # Update 'pixels'
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      if (!is.null(pixels)) {
        raw <- NULL; # Not needed anymore
        if (verbose >= 1)
          cat("Updating 'pixels'...");
        # Write short integers (size=2) to raw vector
        raw <- raw(length=2*n);
        raw <- writeBin(con=raw, pixels[subset], size=2, endian="little");
        pixels <- pixels[-subset]; # Not needed anymore

        # Updated 'rawAll' accordingly
        idx <- rep(sizeOfCell*indices[subset], each=2) + 9:10;
        rawAll[idx] <- raw;
        idx <- NULL; # Not needed anymore
        if (verbose >= 1)
          cat("done.\n");
      }
      raw <- NULL; # Not needed anymore

      # Remove updated indices
      indices <- indices[-subset];
      subset <- NULL; # Not needed anymore

      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # Write raw data back to file
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      if (verbose >= 1)
        cat("Writing chunk to data section...");
      seek(con, origin="start", where=offset, rw="write");
      writeBin(con=con, rawAll);
      if (verbose >= 1)
        cat("done.\n");

      rawAll <- NULL; # Not needed anymore

      if (verbose >= 1)
        cat(sprintf("Updating chunk #%d of %d...done\n", count, nbrOfChunks));
      count <- count + 1;
    } # while (...)
  } # if (version ...)

  invisible(filename);
}


############################################################################
# HISTORY:
# 2007-01-04
# o Added argument 'writeMap'.
# 2006-08-19
# o BUG FIX: Wow wow wow.  This one was tricky to find.  If not specifying
#   the 'rw' argument in seek() it defaults to "", which is not "read" as
#   I naively though (because I did not read the inner details of ?seek),
#   but the latest call to seek.  In other words, since I at the end of
#   every "chunk" loop call seek(..., rw="write") the seek(..., [rw=""])
#   was equal to a seek(..., rw="write"), but I wanted seek(..., rw="read")!
#   That made updateCel() do funny things and write to the wrongs parts
#   of the file etc.
# 2006-08-18
# o BUG FIX: The new implementation where data was written to raw vectors
#   was incorrect.  A lot of extra zeros was written.  The "eastern egg" in
#   the updated example contains an image from http://tpo.berkeley.edu/.
# 2006-08-14
# o BUG FIX: updateCel() would in some cases give "Error: subscript out of
#   bounds" when writing the last chunk.
# 2006-07-22
# o Update updateCel() to update data in chunks, because updating the
#   complete data section is expensive.  For example, a 500K chip has
#   6553600 cells each of size 10 bytes, i.e. >65Mb or raw memory.  With
#   copying etc it costs >100-200Mb of memory to update a CEL file if only
#   the first *and* the last cell is updated.  Now it should only be of
#   the order of 10-20Mb per chunk.
# o Added verbose output to updateCel().
# o Now updateCel() deallocates objects as soon as possible in order to
#   free up as much memory as possible. Had memory problems with the 500K's.
# 2006-07-21
# o updateCel() was really slow when updating a large number of cells.
#   Now the idea is to write to raw vectors stored in memory.  By reading
#   the chunk of the CEL data section that is going to be updated as a raw
#   data vector and then updating this in memory first, and the re-write
#   that chuck of raw data to file, things are much faster.
# o BUG FIX: updateCel(..., indices=NULL) would generate an error, because
#   we tried to reorder by order(indices).
# 2006-06-19
# o Replace 'data' argument with arguments 'intensities', 'stdvs', and
#   'pixels'. /HB
# 2006-06-18
# o First version can update CEL v4 (binary) cell entries.  Note that this
#   code does not make use of the Fusion SDK library.  This may updated in
#   the future, but for now we just want something that works.
# o Created. /HB
############################################################################