Nothing
#########################################################################/**
# @RdocFunction readCcg
#
# @title "Reads an Affymetrix Command Console Generic (CCG) Data file"
#
# @synopsis
#
# \description{
# @get "title". The CCG data file format is also known as the
# Calvin file format.
# }
#
# \arguments{
# \item{pathname}{The pathname of the CCG file.}
# \item{verbose}{An @integer specifying the verbose level. If 0, the
# file is parsed quietly. The higher numbers, the more details.}
# \item{.filter}{A @list.}
# \item{...}{Not used.}
# }
#
# \value{
# A named @list structure consisting of ...
# }
#
# \details{
# Note, the current implementation of this methods does not utilize the
# Affymetrix Fusion SDK library. Instead, it is implemented in R from the
# file format definition [1].
# }
#
# \section{About the CCG file format}{
# A CCG file, consists of a "file header", a "generic data header",
# and "data" section, as outlined here:
# \itemize{
# \item File Header
# \item Generic Data Header (for the file)
# \enumerate{
# \item Generic Data Header (for the files 1st parent)
# \enumerate{
# \item Generic Data Header (for the files 1st parents 1st parent)
# \item Generic Data Header (for the files 1st parents 2nd parent)
# \item ...
# \item Generic Data Header (for the files 1st parents Mth parent)
# }
# \item Generic Data Header (for the files 2nd parent)
# \item ...
# \item Generic Data Header (for the files Nth parent)
# }
# \item Data
# \enumerate{
# \item Data Group \#1
# \enumerate{
# \item Data Set \#1
# \itemize{
# \item Parameters
# \item Column definitions
# \item Matrix of data
# }
# \item Data Set \#2
# \item ...
# \item Data Set \#L
# }
# \item Data Group \#2
# \item ...
# \item Data Group \#K
# }
# }
# }
#
# @author "HB"
#
# \seealso{
# @see "readCcgHeader".
# @see "readCdfUnits".
# }
#
# \references{
# [1] Affymetrix Inc, Affymetrix GCOS 1.x compatible file formats,
# April, 2006.
# \url{http://www.affymetrix.com/support/developer/}\cr
# }
#
# @keyword "file"
# @keyword "IO"
#*/#########################################################################
readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Validate arguments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Argument '.filter':
hasFilter <- FALSE;
if (!is.null(.filter)) {
if (!is.list(.filter)) {
stop("Argument '.filter' must be a list: ", mode(.filter));
}
hasFilter <- TRUE;
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Open file
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
con <- file(pathname, open="rb");
on.exit(close(con));
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Allocate return structure
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ccg <- list();
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read file header
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
fhdr <- .readCcgFileHeader(con);
if (hasFilter) {
if (!identical(.filter$header, FALSE))
ccg$fileHeader <- fhdr;
} else {
ccg$fileHeader <- fhdr;
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read the data header
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ccg$genericDataHeader <- .readCcgDataHeader(con, .filter=.filter$dataHeader);
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read the data
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
dataGroups <- .readCcgDataGroups(con, .filter=.filter$data, .fileHeader=fhdr);
if (hasFilter) {
if (!identical(.filter$dataGroups, FALSE))
ccg$dataGroups <- dataGroups;
} else {
ccg$dataGroups <- dataGroups;
}
ccg;
} # readCcg()
.readCcgDataGroups <- function(pathname, .filter=NULL, .fileHeader=NULL, ...) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Validate arguments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Argument 'pathname':
if (inherits(pathname, "connection")) {
con <- pathname;
} else {
if (!file.exists(pathname))
stop("File not found: ", pathname);
con <- file(pathname, open="rb");
on.exit(close(con));
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read file header?
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if (is.null(.fileHeader)) {
.fileHeader <- .readCcgFileHeader(con);
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read data groups
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
currFilter <- .filter;
nextDataGroupStart <- .fileHeader$dataGroupStart;
dataGroups <- list();
for (gg in seq_len(.fileHeader$nbrOfDataGroups)) {
dataGroupHeader <- .readCcgDataGroupHeader(con,
fileOffset=nextDataGroupStart);
# Next data group
nextDataGroupStart <- dataGroupHeader$nextGroupStart;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Apply filter
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# if (!is.null(.filter)) {
# currFilter <- NULL;
# if (is.null(names(.filter))) {
# currFilter <- .filter[[gg]];
# } else {
# pos <- match(dataGroupHeader$name, names(.filter));
# if (length(pos) > 0)
# currFilter <- .filter[[pos]];
# }
# }
# str(currFilter);
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read data sets
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
offset <- dataGroupHeader$dataSetStart;
dss <- vector("list", dataGroupHeader$nbrOfDataSets);
names <- character(dataGroupHeader$nbrOfDataSets);
for (kk in seq_along(dss)) {
ds <- .readCcgDataSet(con, fileOffset=offset);
offset <- ds$nextDataSetStart;
dss[[kk]] <- ds;
names[kk] <- ds$name;
};
names(dss) <- names;
dataGroup <- list(
header = dataGroupHeader,
dataSets = dss
);
dataGroups <- c(dataGroups, list(dataGroup));
} # while (nextDataGroupStart != 0)
names(dataGroups) <- unlist(lapply(dataGroups, FUN=function(dg) {
dg$header$name
}), use.names=FALSE);
dataGroups;
} # .readCcgDataGroups()
.readCcgDataGroupHeader <- function(con, fileOffset=NULL, ...) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Local functions
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
rawToString <- function(raw, ...) {
# This approach drops all '\0', in order to avoid warnings
# in rawToChar(). Note, it does not truncate the string after
# the first '\0'. However, such strings should never occur in
# the first place.
raw <- raw[raw != as.raw(0)];
rawToChar(raw);
}
readInt <- function(con, n=1, ...) {
readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
}
readUInt <- function(con, n=1, ...) {
# NOTE: Ideally we would use signed=FALSE here, but there is no
# integer data type in R that can hold 4-byte unsigned integers.
# Because of this limitation, readBin() will give a warning that
# signed=FALSE only works for size=1 or 2.
# WORKAROUND: Use signed=TRUE and assume there are no values
# greater that .Machine$integer.max == 2^31-1. /HB 2015-04-15
readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
}
readWString <- function(con, ...) {
nchars <- readInt(con);
if (nchars == 0)
return("");
bfr <- readBin(con, what=raw(), n=2*nchars);
bfr <- bfr[seq(from=2, to=length(bfr), by=2)];
rawToString(bfr);
}
readRaw <- function(con, ...) {
n <- readInt(con);
if (n == 0)
return(raw(0));
readBin(con, what=raw(0), n=n);
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if (!is.null(fileOffset)) {
seek(con=con, where=fileOffset, offset="start", rw="read");
}
# Data Group
# This section describes the data group. A data group is a group
# of data sets. The file supports one or more data groups in a file.
#
# Item Description Type
# 1 File position of the next data group. When this is the last
# data group in the file, the value should be 0. UINT
# 2 File position of the first data set within the data group. UINT
# 3 The number of data sets within the data group. INT
# 4 The data group name. WSTRING
nextGroupStart=readUInt(con)
dataSetStart=readUInt(con)
nbrOfDataSets=readInt(con)
name=readWString(con)
dataGroupHeader <- list(
nextGroupStart=nextGroupStart,
dataSetStart=dataSetStart,
nbrOfDataSets=nbrOfDataSets,
name=name
)
dataGroupHeader;
} # .readCcgDataGroupHeader()
.readCcgDataSet <- function(con, fileOffset=NULL, ...) {
# Value Types
# The following table defines the numeric values for the value types.
# The value type is used to representing the type of value stored in
# the file.
#
# Value Type
# 0 BYTE
# 1 UBYTE
# 2 SHORT
# 3 USHORT
# 4 INT
# 5 UINT
# 6 FLOAT
# 7 STRING
# 8 WSTRING
whats <- c("integer", "integer", "integer", "integer", "integer",
"integer", "double", "character", "character");
names(whats) <- c("BYTE", "UBYTE", "SHORT", "USHORT", "INT", "UINT", "FLOAT", "STRING", "WSTRING");
signeds <- c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE);
sizes <- c(1, 1, 2, 2, 4, 4, 4, 1, 2);
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Local functions
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
rawToString <- function(raw, ...) {
# This approach drops all '\0', in order to avoid warnings
# in rawToChar(). Note, it does not truncate the string after
# the first '\0'. However, such strings should never occur in
# the first place.
raw <- raw[raw != as.raw(0)];
rawToChar(raw);
}
readByte <- function(con, n=1, ...) {
readBin(con, what=integer(), size=1, signed=TRUE, endian="big", n=n);
}
readInt <- function(con, n=1, ...) {
readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
}
readUInt <- function(con, n=1, ...) {
# NOTE: Ideally we would use signed=FALSE here, but there is no
# integer data type in R that can hold 4-byte unsigned integers.
# Because of this limitation, readBin() will give a warning that
# signed=FALSE only works for size=1 or 2.
# WORKAROUND: Use signed=TRUE and assume there are no values
# greater that .Machine$integer.max == 2^31-1. /HB 2015-04-15
readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
}
readString <- function(con, ...) {
nchars <- readInt(con);
if (nchars == 0)
return("");
readChar(con, nchars=nchars);
}
readWString <- function(con, ...) {
nchars <- readInt(con);
if (nchars == 0)
return("");
bfr <- readBin(con, what=raw(), n=2*nchars);
bfr <- bfr[seq(from=2, to=length(bfr), by=2)];
rawToString(bfr);
}
readRaw <- function(con, ...) {
n <- readInt(con);
if (n == 0)
return(raw(0));
readBin(con, what=raw(0), n=n);
}
readWVT <- function(con, ...) {
name <- readWString(con);
raw <- readRaw(con);
type <- readWString(con);
# Update data types
# * text/x-calvin-integer-8
# * text/x-calvin-unsigned-integer-8
# * text/x-calvin-integer-16
# * text/x-calvin-unsigned-integer-16
# * text/x-calvin-integer-32
# * text/x-calvin-unsigned-integer-32
# * text/x-calvin-float
# * text/plain
n <- length(raw);
# cat(sprintf("Reading n=%d records of type '%s' named '%s'.\n", n, type, name));
value <- switch(type,
"text/ascii" = {
rawToString(raw);
},
"text/plain" = {
# Unicode/UTF-16?!?
raw <- matrix(raw, ncol=2, byrow=TRUE);
raw <- raw[,2];
rawToString(raw);
},
"text/x-calvin-integer-8" = {
readBin(raw, what=integer(0), endian="big", size=1, signed=TRUE, n=n);
},
"text/x-calvin-unsigned-integer-8" = {
readBin(raw, what=integer(0), endian="big", size=1, signed=FALSE, n=n);
},
"text/x-calvin-integer-16" = {
readBin(raw, what=integer(0), endian="big", size=2, signed=TRUE, n=n);
},
"text/x-calvin-unsigned-integer-16" = {
readBin(raw, what=integer(0), endian="big", size=2, signed=FALSE, n=n);
},
"text/x-calvin-integer-32" = {
readBin(raw, what=integer(0), endian="big", size=4, signed=TRUE, n=n);
},
"text/x-calvin-unsigned-integer-32" = {
# NOTE: Ideally we would use signed=FALSE here, but there is no
# integer data type in R that can hold 4-byte unsigned integers.
# Because of this limitation, readBin() will give a warning that
# signed=FALSE only works for size=1 or 2.
# WORKAROUND: Use signed=TRUE and assume there are no values
# greater that .Machine$integer.max == 2^31-1. /HB 2015-04-15
readBin(raw, what=integer(0), endian="big", size=4, signed=TRUE, n=n);
},
"text/x-calvin-float" = {
readBin(raw, what=double(0), endian="big", size=4, n=n);
},
{
raw;
}
) # switch()
list(name=name, value=value, raw=raw, type=type);
} # readWVT()
readWBI <- function(con, ...) {
list(name=readWString(con), type=readByte(con), size=readInt(con));
}
if (!is.null(fileOffset)) {
seek(con=con, where=fileOffset, offset="start", rw="read");
}
# Data Set
# This section describes the data for a single data set item
# (probe set, sequence, allele, etc.). The file supports one
# or more data sets within a data group.
#
# Item Description Type
# 1 The file position of the first data element in the data set.
# This is the first byte after the data set header. UINT
# 2 The file position of the next data set within the data group.
# When this is the last data set in the data group the value
# shall be 1 byte past the end of the data set. This way the size
# of the data set may be determined. UINT
# 3 The data set name. WSTRING
# 4 The number of name/value/type parameters. INT
# 5 Array of name/value/type parameters. (WSTRING / VALUE / TYPE) [ ]
# 6 Number of columns in the data set.
# Example: For expression arrays, columns may include signal, p-value,
# detection call and for genotyping arrays columns may include allele
# call, and confidence value. For universal arrays, columns may
# include probe set intensities and background. UINT
# 7 An array of column names, column value types and column type sizes
# (one per column).
# The value type shall be represented by the value from the value type
# table. The size shall be the size of the type in bytes. For strings,
# this value shall be the size of the string in bytes plus 4 bytes for
# the string length written before the string in the file.
# (WSTRING / BYTE / INT) [ ]
# 8 The number of rows in the data set. UINT
# 9 The data set table, consisting of rows of columns (data values).
# The specific type and size of each column is described by the data
# and size types above. ROW [ ]
dataSet <- list(
elementsStart=readUInt(con),
nextDataSetStart=readUInt(con),
name=readWString(con)
)
# Reading parameters
nbrOfParams <- readInt(con);
params <- vector("list", nbrOfParams);
names <- character(nbrOfParams);
for (kk in seq_len(nbrOfParams)) {
wvt <- readWVT(con);
names[kk] <- wvt$name;
value <- wvt$value;
attr(value, "mimeType") <- wvt$type;
params[[kk]] <- value;
}
names(params) <- names;
dataSet$parameters <- params;
# Reading columns
nbrOfColumns <- readUInt(con);
columns <- vector("list", nbrOfColumns);
names <- character(nbrOfColumns);
colWhats <- vector("list", nbrOfColumns);
bytesPerRow <- 0;
for (cc in seq_len(nbrOfColumns)) {
wbi <- readWBI(con);
names[cc] <- wbi$name;
what <- whats[wbi$type+1];
signed <- signeds[wbi$type+1];
size <- wbi$size;
bytesPerRow <- bytesPerRow + size;
attr(what, "name") <- names(whats)[wbi$type+1];
attr(what, "signed") <- signed;
attr(what, "size") <- size;
colWhats[[cc]] <- what;
}
names(colWhats) <- names;
bytesPerRow <- as.integer(bytesPerRow);
nbrOfRows <- readUInt(con);
totalNbrOfBytes <- nbrOfRows * bytesPerRow;
# Skip to the first element
seek(con, which=dataSet$elementsStart, offset="start", rw="read");
# Read all data row by row
raw <- readBin(con, what=raw(), n=totalNbrOfBytes);
dim(raw) <- c(bytesPerRow, nbrOfRows);
table <- vector("list", nbrOfColumns);
colsOffset <- 0;
for (cc in seq_len(nbrOfColumns)) {
what <- colWhats[[cc]];
signed <- attr(what, "signed");
size <- attr(what, "size");
if (what == "character") {
value <- matrix(raw[1:4,], nrow=nbrOfRows, ncol=4);
raw <- raw[-c(1:4),,drop=FALSE];
# Get the number of characters per string (all equal)
## nchars <- readInt(con=value, n=nbrOfRows);
## nchars <- nchars[1];
nchars <- readInt(con=value, n=1);
value <- NULL; # Not needed anymore
ccs <- 1:(size-4);
value <- raw[ccs,];
raw <- raw[-ccs,,drop=FALSE];
value <- rawToChar(value, multiple=TRUE);
dim(value) <- c(nchars, nbrOfRows);
# Build strings using vectorization (not apply()!)
strs <- NULL;
for (pp in seq_len(nrow(value))) {
valuePP <- value[1,,drop=FALSE];
value <- value[-1,,drop=FALSE];
if (pp == 1) {
strs <- valuePP;
} else {
strs <- paste(strs, valuePP, sep="");
}
valuePP <- NULL; # Not needed anymore
}
value <- strs;
strs <- NULL; # Not needed anymore
} else {
ccs <- 1:size;
value <- raw[ccs,,drop=FALSE];
raw <- raw[-ccs,,drop=FALSE];
value <- readBin(con=value, what=what, size=size, signed=signed, endian="big", n=nbrOfRows);
}
table[[cc]] <- value;
colsOffset <- colsOffset + size;
} # for (cc ...)
# Turn into a data frame
attr(table, "row.names") <- .set_row_names(length(table[[1]]));
attr(table, "names") <- names;
class(table) <- "data.frame";
dataSet$table <- table;
dataSet;
} # .readCcgDataSet()
############################################################################
# HISTORY:
# 2012-05-18
# o Now using stop() instead of throw().
# 2011-11-01
# o CLEANUP: Changed signed=FALSE to signed=TRUE for readBin() calls
# reading 4-byte integers in internal .readCcgDataGroupHeader() and
# .readCcgDataSet().
# 2009-02-10
# o Added internal rawToString() replacing rawToChar() to avoid warnings
# on "truncating string with embedded nul".
# 2008-08-23
# o SPEED UP: Removed all gc() calls.
# 2008-01-13
# o Removed dependency on intToChar() in R.utils.
# o BUG FIX/UPDATE: The file format was updated between April 2006 and
# November 2007. More specifically, the so called "Value Types" were
# changed/corrected. Before values 7:9 were 'DOUBLE', 'STRING', and
# 'WSTRING'. Now 7:8 are 'STRING' and 'WSTRING' and there is no longer
# a 'DOUBLE'.
# This was detected while trying to read a CNCHP file outputted by the
# new Affymetrix Genotyping Console 2.0. We can now read these files.
# 2007-08-16
# o Now it is only readCcg() and readCcgHeader() that are public. The
# other readCcgNnn() functions are renamed to .readCcgNnn().
# o Now the read data is converted according to the mime type. See internal
# readWVT() function. The code is still ad hoc, so it is not generic.
# For instance, it basically assumes that Unicode strings only contain
# ASCII/ASCII-8 characters.
# 2006-11-06
# o Tested on Test3-1-121502.calvin.CEL and Test3-1-121502.calvin.CDF.
# o Created.
############################################################################
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.