View source: R/readCdfUnitsWriteMap.R
readCdfUnitsWriteMap | R Documentation |
Generates an Affymetrix cell-index write map from a CDF file.
The purpose of this method is to provide a re-ordering of cell elements such that cells in units (probesets) can be stored in contiguous blocks. When reading cell elements unit by unit, minimal file re-position is required resulting in a faster reading.
Note: At the moment does this package not provide methods to
write/reorder CEL files. In the meanwhile, you have to write
and re-read using your own file format. That's not too hard using
writeBin()
and readBin
().
readCdfUnitsWriteMap(filename, units=NULL, ..., verbose=FALSE)
filename |
The pathname of the CDF file. |
units |
An |
... |
Additional arguments passed to |
verbose |
Either a |
A integer
vector
which is a write map.
Henrik Bengtsson
To invert maps, see invertMap
().
readCel
() and readCelUnits
().
##############################################################
if (require("AffymetrixDataTestFiles")) { # START #
##############################################################
# Find any CDF file
cdfFile <- findCdf()
# Create a cell-index map (for writing)
writeMap <- readCdfUnitsWriteMap(cdfFile)
# Inverse map to be used to read cell elements such that, when read
# read unit by unit, they are read much faster.
readMap <- invertMap(writeMap)
# Validate the two maps
stopifnot(identical(readMap[writeMap], 1:length(readMap)))
cat("Summary of the \"randomness\" of the cell indices:\n")
moves <- diff(readMap) - 1
cat(sprintf("Number of unnecessary file re-positioning: %d (%.1f%%)\n",
sum(moves != 0), 100*sum(moves != 0)/length(moves)))
cat(sprintf("Extra positioning: %.1fGb\n", sum(abs(moves))/1024^3))
smallMoves <- moves[abs(moves) <= 25];
largeMoves <- moves[abs(moves) > 25];
layout(matrix(1:2))
main <- "Non-signed file moves required in unorded file"
hist(smallMoves, nclass=51, main=main, xlab="moves <=25 bytes")
hist(largeMoves, nclass=101, main="", xlab="moves >25 bytes")
# Clean up
layout(1)
rm(cdfFile, readMap, writeMap, moves, smallMoves, largeMoves, main)
##############################################################
} # STOP #
##############################################################
##############################################################
if (require("AffymetrixDataTestFiles")) { # START #
##############################################################
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Function to read Affymetrix probeset annotations
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
readAffymetrixProbesetAnnotation <- function(pathname, ...) {
# Get headers
header <- scan(pathname, what="character", sep=",", quote="\"",
quiet=TRUE, nlines=1);
# Read only a subset of columns (unique to this example)
cols <- c("Probe Set ID"="probeSet",
"Chromosome"="chromosome",
"Physical Position"="physicalPosition",
"dbSNP RS ID"="dbSnpId");
colClasses <- rep("NULL", length(header));
colClasses[header %in% names(cols)] <- "character";
# Read the data (this is what takes time)
df <- read.table(pathname, colClasses=colClasses, header=TRUE, sep=",",
quote="\"", na.strings="---", strip.white=TRUE, check.names=FALSE,
blank.lines.skip=FALSE, fill=FALSE, comment.char="", ...);
# Re-order columns
df <- df[,match(names(cols),colnames(df))];
colnames(df) <- cols;
# Use "Probe Set ID" as rownames. Note that if we use 'row.names=1'
# or similar something goes wrong. /HB 2006-03-06
rownames(df) <- df[[1]];
df <- df[,-1];
# Change types of columns
df[[1]] <- factor(df[[1]], levels=c(1:22,"X","Y",NA), ordered=TRUE);
df[[2]] <- as.integer(df[[2]]);
df;
} # readAffymetrixProbesetAnnotation()
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Main
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
for (zz in 1) {
# Chip to be remapped
chipType <- "Mapping50K_Xba240"
annoFile <- paste(chipType, "_annot.csv", sep="")
cdfFile <- findCdf(chipType)
if (is.null(cdfFile) || !file.exists(annoFile))
break;
# Read SNP location details
snpInfo <- readAffymetrixProbesetAnnotation(annoFile)
# Order by chromsome and then physical position
o <- order(snpInfo[[1]], snpInfo[[2]])
snpInfo <- snpInfo[o,]
rm(o)
# Read unit names in CDF file
unitNames <- readCdfUnitNames(cdfFile)
# The CDF unit indices sorted by chromsomal position
units <- match(rownames(snpInfo), unitNames)
# ...and cell indices in the same order
writeMap <- readCdfUnitsWriteMap(cdfFile, units=units)
# Inverse map to be used to write cell elements such that, if they
# later are read unit by unit, they are read in contiguous blocks.
readMap <- invertMap(writeMap)
# Clean up
rm(chipType, annoFile, cdfFile, snpInfo, unitNames, units, readMap, writeMap)
} # for (zz in 1)
##############################################################
} # STOP #
##############################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.