###########################################################################/**
# @set "class=AffymetrixCdfFile"
# @RdocMethod createUniqueCdf
# @aliasmethod createUnique
#
# @title "Creates a unique-cell version of the CDF"
#
# \description{
# In some cases a single probe maps to multiple genomic locations, so its
# cell index occurs in more than one unit of the CDF. If you later want to
# store a per-probe estimate (e.g. a probe affinity or a residual), you can
# store at most one value per cell. The unique CDF facilitates this by
# making the cell indices unique (essentially giving each occurrence of a
# multimapping probe its own cell).
# }
#
# @synopsis
#
# \arguments{
# \item{chipType}{The chip type of the new CDF.}
# \item{tags}{Tags added to the chip type of the new CDF.}
# \item{path}{The path where to store the new CDF file.}
# \item{...}{Additional arguments passed to @see "affxparser::writeCdf".}
# \item{ram}{A @double scaling how many units are processed per chunk.
# A smaller value uses less memory.}
# \item{verbose}{A @logical or @see "R.utils::Verbose".}
# }
#
# \value{
# Returns the unique CDF as an @see "AffymetrixCdfFile" object.
# }
#
# \details{
# Note that the set of units in the "unique" CDF is identical to that
# of the input CDF. So are the unit groups in each unit.
# Also the number of cells per unit group is preserved.
# It is only the cell-index composition of each unit group that changes.
# The cell indices in the unique CDF are guaranteed to occur only
# once, whereas this may not be true for the input CDF.
# }
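#
# \examples{\dontrun{
#   # A minimal usage sketch (assumes a CDF for the 'Mapping250K_Nsp' chip
#   # type is available under annotationData/chipTypes/Mapping250K_Nsp/):
#   cdf <- AffymetrixCdfFile$byChipType("Mapping250K_Nsp")
#   cdfU <- getUniqueCdf(cdf, verbose=TRUE)  # creates the unique CDF if missing
#   print(cdfU)
#   stopifnot(isUniqueCdf(cdfU))
# }}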
#
# @author "MR"
#
# \seealso{
# @seeclass
# }
#
# @keyword IO
#*/###########################################################################
setMethodS3("createUniqueCdf", "AffymetrixCdfFile", function(this, chipType=getChipType(this), tags="unique", path=NULL, units=NULL, ..., ram=NULL, verbose=TRUE) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Local functions
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
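# rearrangeCells(): reassigns consecutive cell indices (offset+1, offset+2,
# ...) to all cells of the units in 'units', and recomputes each cell's
# zero-based (x,y) position from its new index assuming an array with
# 'ncols' columns. With hasGroups=FALSE the elements of 'units' are treated
# as groups themselves (as for QC units).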
rearrangeCells <- function(units, offset=0, hasGroups=TRUE, ncols, ...) {
rearrangeGroup <- function(group, idxs, ...) {
y <- (idxs-1) %/% ncols
x <- (idxs-1) - ncols*y
group$y <- y
group$x <- x
group$indices <- idxs
group
} # rearrangeGroup()
nbrOfCells <- lapply(units, FUN=function(unit) .subset2(unit, "ncells"))
nbrOfCells <- sum(unlist(nbrOfCells, use.names=FALSE))
cells <- seq(from=offset+1, to=offset+nbrOfCells)
verbose && printf(verbose, "Units: ")
if (hasGroups) {
for (kk in seq_along(units)) {
if (verbose) {
if (kk %% 1000 == 0) {
printf(verbose, "%d, ", kk)
} else if (kk %% 100 == 0) {
cat(".")
}
}
# groups <- units[[kk]]$groups
groups <- .subset2(.subset2(units, kk), "groups")
for (ll in seq_along(groups)) {
group <- .subset2(groups, ll)
# Number of cells in this group
nindices <- length(.subset2(group, "indices"))
head <- 1:nindices
idxs <- .subset(cells, head)
cells <- .subset(cells, (nindices+1):length(cells))
groups[[ll]] <- rearrangeGroup(group, idxs)
}
units[[kk]]$groups <- groups
}
} else {
for (kk in seq_along(units)) {
if (verbose) {
if (kk %% 1000 == 0) {
printf(verbose, "%d, ", kk)
} else if (kk %% 100 == 0) {
cat(".")
}
}
group <- .subset2(units, kk)
# Number of cells in this group
nindices <- length(.subset2(group, "indices"))
head <- 1:nindices
idxs <- .subset(cells, head)
cells <- .subset(cells, (nindices+1):length(cells))
group <- rearrangeGroup(group, idxs)
units[[kk]] <- group
}
}
verbose && printf(verbose, "\n")
units
} # rearrangeCells()
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Validate arguments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Argument 'ram':
ram <- getRam(aromaSettings, ram)
# Argument 'verbose':
verbose <- Arguments$getVerbose(verbose)
if (verbose) {
pushState(verbose)
on.exit(popState(verbose))
}
verbose2 <- as.integer(verbose)-1; # For 'affxparser' calls.
# Argument 'units':
if (is.null(units)) {
units <- seq_len(nbrOfUnits(this))
} else {
units <- Arguments$getIndices(units, max=nbrOfUnits(this))
}
# Already a unique CDF?
if (isUniqueCdf(this)) {
return(this)
}
verbose && enter(verbose, "Creating unique CDF")
verbose && cat(verbose, "Chip type: ", getChipType(this))
# Get the pathname of the source
src <- getPathname(this)
src <- Arguments$getReadablePathname(src)
# Create the pathname of the destination CDF
if (is.null(path)) {
mainChipType <- gsub("[,].*", "", chipType)
path <- filePath("annotationData", "chipTypes", mainChipType)
}
# Write to a temporary file first, and rename it once we know it's complete
name <- paste(c(chipType, tags), collapse=",")
filename <- sprintf("%s.CDF", name)
pathname <- Arguments$getWritablePathname(filename, path=path)
pathname <- AffymetrixFile$renameToUpperCaseExt(pathname)
pathname <- Arguments$getWritablePathname(pathname, mustNotExist=TRUE)
pathnameT <- pushTemporaryFile(pathname, verbose=verbose)
# Assure source and destination are not identical
if (identical(src, pathnameT)) {
throw("Cannot create CDF file. Destination is same as source: ", src)
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Figure out the number of (regular) unit cells in the output
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read the unit group names (used below to count groups per unit)
verbose && enter(verbose, "Reading CDF group names")
nbrOfGroupsPerUnit <- .readCdfGroupNames(src)
verbose && exit(verbose)
names(nbrOfGroupsPerUnit) <- NULL
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
# Get the number of groups per unit
nbrOfGroupsPerUnit <- sapply(nbrOfGroupsPerUnit, FUN=length)
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
# get a relatively low-memory version of the CDF indices
# Q: Why is readCdf(..., stratifyBy=<vector>) not throwing?
# Error in match.arg(stratifyBy) : 'arg' must be of length 1
# /HB 2011-04-15
cdfLite <- .readCdf(src, units=units,
readXY=FALSE, readBases=FALSE,
readIndexpos=FALSE, readAtoms=FALSE,
readUnitType=FALSE, readUnitDirection=FALSE,
readUnitNumber=FALSE, readUnitAtomNumbers=FALSE,
readGroupAtomNumbers=FALSE, readGroupDirection=FALSE,
readIndices=TRUE, readIsPm=FALSE,
stratifyBy=c("nothing", "pmmm", "pm", "mm"), verbose=0)
# Get the number of cells per unit
nbrOfCellsPerUnit <- sapply(cdfLite, FUN=function(unit) {
length(unlist(unit, use.names=FALSE))
})
verbose && cat(verbose, "Number of cells per unit:")
verbose && summary(verbose, nbrOfCellsPerUnit)
# Not needed anymore
cdfLite <- NULL; # Not needed anymore
# Total number of cells in CDF
nbrOfCells <- sum(nbrOfCellsPerUnit)
# Number of units in CDF
nbrOfUnits <- length(nbrOfCellsPerUnit)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# QC units
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Keep all QC units
verbose && enter(verbose, "Reading CDF QC units")
destQcUnits <- .readCdfQc(src)
verbose && exit(verbose)
nbrOfQcUnits <- length(destQcUnits)
nbrOfCellsPerQcUnit <- lapply(destQcUnits, FUN=.subset2, "ncells")
nbrOfCellsPerQcUnit <- unlist(nbrOfCellsPerQcUnit, use.names=FALSE)
nbrOfQcCells <- sum(nbrOfCellsPerQcUnit)
verbose && printf(verbose,
"Number of QC cells: %d in %d QC units (%s)\n",
nbrOfQcCells, nbrOfQcUnits,
hsize(object.size(destQcUnits), digits = 2L, standard = "IEC"))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Chip layout
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
totalNbrOfCells <- nbrOfCells + nbrOfQcCells
verbose && printf(verbose, "Total number of cells: %d\n", totalNbrOfCells)
# Figure out a best fit square(!) layout of this number of cells
side <- as.integer(floor(sqrt(totalNbrOfCells)))
nrows <- ncols <- side
# If not big enough, increase the number of rows, then the number of columns
if (nrows*ncols < totalNbrOfCells) {
nrows <- as.integer(nrows + 1)
if (nrows*ncols < totalNbrOfCells) {
ncols <- as.integer(ncols + 1)
}
}
verbose && printf(verbose, "Best array dimension: %dx%d (=%d cells, i.e. %d left-over cells)\n", nrows, ncols, nrows*ncols, nrows*ncols - totalNbrOfCells)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Create CDF header
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
verbose && enter(verbose, "Creating CDF header with source CDF as template")
verbose && enter(verbose, "Setting up header")
# Get template header
verbose && enter(verbose, "Reading CDF header")
destHeader <- .readCdfHeader(src)
verbose && exit(verbose)
# Update CDF header dimension
destHeader$nrows <- nrows
destHeader$ncols <- ncols
# Get unit names
verbose && enter(verbose, "Reading CDF unit names")
unitNames <- .readCdfUnitNames(src)
verbose && exit(verbose)
if (nbrOfUnits > 0) {
# Number of bytes per unit:
# = 20 + (18+64)*nbrOfGroupsPerUnit + 14*nbrOfCellsPerUnit bytes
unitLengths <- 20 + 82*nbrOfGroupsPerUnit + 14*nbrOfCellsPerUnit
avgUnitLength <- mean(unitLengths)
} else {
unitLengths <- NULL
avgUnitLength <- NA_real_
}
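# Example: a unit with 2 groups and 40 cells in total occupies
# 20 + 82*2 + 14*40 = 744 bytes.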
# Number of bytes per QC unit:
if (nbrOfQcUnits > 0) {
# Start positions for QC units
# Number of bytes per QC unit:
# = 6 + 7*nbrOfCellsPerQcUnit bytes
qcUnitLengths <- 6 + 7*nbrOfCellsPerQcUnit
} else {
qcUnitLengths <- NULL
}
verbose && exit(verbose)
verbose && enter(verbose, "Writing")
verbose && cat(verbose, "destHeader:")
verbose && str(verbose, destHeader)
verbose && cat(verbose, "unitNames:")
verbose && str(verbose, unitNames)
verbose && cat(verbose, "qcUnitLengths:")
verbose && str(verbose, qcUnitLengths)
verbose && cat(verbose, "unitLengths:")
verbose && str(verbose, unitLengths)
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
# Open output connection
con <- file(pathnameT, open = "wb")
on.exit({
if (!is.null(con))
close(con)
con <- NULL
})
# Allocate/write CDF header
.writeCdfHeader(con=con, destHeader, unitNames=unitNames,
qcUnitLengths=qcUnitLengths, unitLengths=unitLengths,
verbose=verbose2)
# Not needed anymore
destHeader <- unitNames <- qcUnitLengths <- unitLengths <- NULL
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
verbose && exit(verbose)
verbose && exit(verbose)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Restructure QC units
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
verbose && enter(verbose, "Writing QC units")
verbose && enter(verbose, "Rearranging QC unit cell indices")
destQcUnits <- rearrangeCells(destQcUnits, offset=nbrOfCells, ncols=ncols,
hasGroups=FALSE, verbose=verbose)
# Garbage collect
gc <- gc()
verbose && exit(verbose)
# Write QC units
.writeCdfQcUnits(con=con, destQcUnits, verbose=verbose2)
# Not needed anymore
destQcUnits <- NULL; # Not needed anymore
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
verbose && exit(verbose)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Units
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Create all new units in chunks
verbose && printf(verbose, "Number of units: %d\n", nbrOfUnits)
verbose && printf(verbose, "Argument 'ram': %f\n", ram)
verbose && printf(verbose, "Average unit length: %f bytes\n", avgUnitLength)
# Scale the chunk size by the average unit length (relative to a nominal
# 200-byte unit)
if (nbrOfUnits > 0) {
scale <- 200/avgUnitLength
} else {
scale <- 1
}
# Number of units per chunk
nbrOfUnitsPerChunk <- as.integer(ram * scale * 20000)
nbrOfChunks <- ceiling(nbrOfUnits / nbrOfUnitsPerChunk)
verbose && printf(verbose, "Number of chunks: %d (%d units/chunk)\n",
nbrOfChunks, nbrOfUnitsPerChunk)
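# Example: with ram=1 and an average unit length of 200 bytes, scale = 1
# and each chunk holds 20,000 units (about 4 MB of unit data per chunk).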
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Reading and extracting unit data
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
verbose && enter(verbose, "Reading, extracting, and writing units")
# fields <- c("x", "y", "indices", "pbase", "tbase", "atom", "indexpos")
fields <- c("pbase", "tbase", "atom", "indexpos")
head <- 1:nbrOfUnitsPerChunk
count <- 1
idxOffset <- as.integer(0)
unitsToDo <- seq_len(nbrOfUnits)
while (length(unitsToDo) > 0) {
if (length(unitsToDo) < nbrOfUnitsPerChunk) {
head <- 1:length(unitsToDo)
}
units <- unitsToDo[head]
verbose && printf(verbose, "Chunk #%d of %d (%d units)\n",
count, nbrOfChunks, length(units))
verbose && cat(verbose, "Units:")
verbose && str(verbose, units)
unitsToDo <- unitsToDo[-head]
# Read CDF structure
verbose && enter(verbose, "Reading CDF list structure")
srcUnits <- .readCdf(src, units=units, readGroupDirection=TRUE)
verbose && exit(verbose)
# Sanity check
if (is.null(srcUnits)) {
throw(sprintf("Failed to read %d units from CDF file. This could be because you are running out of memory. Try decreasing argument 'ram': %s", length(units), src))
}
# Sanity check
if (length(srcUnits) != length(units)) {
throw("Number of read CDF units does not equal number of requested units: ", length(srcUnits), " != ", length(units))
}
if (verbose && isVisible(verbose)) {
printf(verbose, " => RAM: %s\n", hsize(object.size(srcUnits), digits = 2L, standard = "IEC"))
}
# Sanity check
if (length(srcUnits) == 0) {
throw("Internal error: While creating unique CDF, an empty list of CDF units was read.")
}
# Reassign cell indices: give each group fresh consecutive indices
# (continuing from 'idxOffset') and recompute (x,y) from them
srcUnits <- lapply(srcUnits, FUN=function(unit) {
groups <- .subset2(unit, "groups")
groups <- lapply(groups, FUN=function(group) {
nThisGroup <- length(.subset2(group,"pbase"))
#group[fields] <- lapply(.subset(group, fields), FUN=.subset, fIdxs)
#group$natoms <- nThisGroup
#group$ncellsperatom <- as.integer(1)
idxs <- idxOffset + seq_len(nThisGroup)
idxs1 <- as.integer(idxs-1); # Makes sure x & y are integers.
y <- idxs1 %/% ncols
group$y <- y
group$x <- idxs1 - ncols*y
idxOffset <<- idxOffset + nThisGroup
group
})
#ncells <- length(groups)*nbrOfCellsPerField
unit$groups <- groups
#unit$ncells <- ncells
#unit$natoms <- ncells
#unit$ncellsperatom <- as.integer(1)
unit
})
#return(srcUnits)
# Sanity check
if (length(srcUnits) == 0) {
throw("Internal error: While creating unique CDF, an empty list of CDF units is requested to be written.")
}
# verbose && str(verbose, srcUnits[1])
# Write regular units
.writeCdfUnits(con=con, srcUnits, verbose=verbose2)
# Not needed anymore
srcUnits <- units <- NULL; # Not needed anymore
count <- count + 1
} # while (length(unitsToDo) > 0)
# Not needed anymore
unitsToDo <- head <- fields <- count <- NULL
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
verbose && exit(verbose)
# Close the connection
close(con)
con <- NULL
verbose && cat(verbose, "Temporary CDF file created:")
verbose && cat(verbose, "Output pathname: ", pathnameT)
verbose && print(verbose, file.info(pathnameT))
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Verifying the CDF
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
verbose && enter(verbose, "Verifying the written CDF")
# Checking header
header <- .readCdfHeader(pathnameT)
# Validate the array dimension
if ((header$nrows != nrows) || (header$ncols != ncols)) {
throw(sprintf("Failed to create a valid unique-cell CDF: The dimension of the written CDF does not match the intended one: (%d,%d) != (%d,%d)", header$nrows, header$ncols, nrows, ncols))
}
# Checking cell indices
# Check CDF file in chunks
nbrOfUnits <- header$probesets
chunkSize <- 10000*ram
nbrOfChunks <- ceiling(nbrOfUnits / chunkSize)
for (kk in 1:nbrOfChunks) {
verbose && printf(verbose, "Chunk %d of %d\n", kk, nbrOfChunks)
from <- (kk-1)*chunkSize+1
to <- min(from+chunkSize, nbrOfUnits)
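# Note: 'to' is from+chunkSize (not from+chunkSize-1), so consecutive
# chunks overlap by one unit and contiguity is also checked across
# chunk boundaries.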
cells <- .readCdfCellIndices(pathnameT, units=from:to)
cells <- unlist(cells, use.names=FALSE)
cells <- diff(cells)
cells <- unique(cells)
udcells <- as.integer(cells)
if (!identical(udcells, 1L)) {
throw("Failed to create a valid unique-cell CDF: The cell indices are not contiguous: ", paste(udcells, collapse=", "))
}
# Not needed anymore
cells <- udcells <- NULL
}
verbose && exit(verbose)
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
# Rename temporary file
popTemporaryFile(pathnameT, verbose=verbose)
verbose && cat(verbose, "File pathname: ", pathname)
verbose && print(verbose, file.info(pathname))
verbose && exit(verbose)
verbose && exit(verbose)
# Return an AffymetrixCdfFile object for the new CDF
cdfU <- newInstance(this, pathname)
verbose && enter(verbose, "Final set of sanity checks")
verbose && cat(verbose, "Number of units")
stopifnot(nbrOfUnits(cdfU) == nbrOfUnits(this))
verbose && cat(verbose, "Number of groups per unit")
stopifnot(identical(nbrOfGroupsPerUnit(cdfU), nbrOfGroupsPerUnit(this)))
verbose && cat(verbose, "Groups names per unit")
stopifnot(identical(.readCdfGroupNames(getPathname(cdfU)), .readCdfGroupNames(getPathname(this))))
verbose && cat(verbose, "Number of cells per unit group")
stopifnot(identical(nbrOfCellsPerUnitGroup(cdfU), nbrOfCellsPerUnitGroup(this)))
verbose && cat(verbose, "Consecutive ordering of cell indices")
cells <- getCellIndices(cdfU, unlist=TRUE, useNames=FALSE)
stopifnot(length(cells) <= nbrOfCells(cdfU))
stopifnot(identical(unique(diff(cells)), 1L))
## Create checksum file
cdfUZ <- getChecksumFile(cdfU)
verbose && exit(verbose)
cdfU
}, private=TRUE) # createUniqueCdf()
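# getUniqueCdf(): returns the unique-cell CDF corresponding to this CDF.
# An existing '<chipType>,unique' CDF is used if found; otherwise it is
# created via createUniqueCdf().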
setMethodS3("getUniqueCdf", "AffymetrixCdfFile", function(this, ..., verbose=FALSE) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Validate arguments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Argument 'verbose':
verbose <- Arguments$getVerbose(verbose)
if (verbose) {
pushState(verbose)
on.exit(popState(verbose))
}
# Already a unique CDF?
if (isUniqueCdf(this)) {
return(this)
}
verbose && enter(verbose, "Retrieving unique CDF")
# The full chiptype of the unique CDF
chipType <- sprintf("%s,unique", getChipType(this))
verbose && cat(verbose, "Unique chip type: ", chipType)
# First, try to locate an existing unique CDF
verbose && enter(verbose, "Locating unique CDF")
pathname <- findByChipType(this, chipType=chipType, ...)
verbose && cat(verbose, "Pathname: ", pathname)
verbose && exit(verbose)
if (is.null(pathname)) {
verbose && enter(verbose, "Could not locate unique CDF. Will create one for chip type")
res <- createUniqueCdf(this, ..., verbose=less(verbose))
verbose && exit(verbose)
} else {
res <- byChipType(this, chipType=chipType, ...)
}
verbose && exit(verbose)
res
}, protected=TRUE)
setMethodS3("isUniqueCdf", "AffymetrixCdfFile", function(this, ...) {
hasTag(this, "unique")
})
# equals(getMainCdf(getUniqueCdf(cdf)), cdf) == TRUE
#setMethodS3("getMainCdf", "AffymetrixCdfFile", function(this, ...) {
# # Argument 'this':
# if (!isUniqueCdf(this)) {
# throw("Cannot get the main CDF, because this CDF is not a unique CDF: ", getPathname(this))
# }
#
# chipType <- getChipType(this, fullname=FALSE)
# tags <- getTags(this)
# tags <- setdiff(tags, c("unique"))
#
# # Try to locate the main CDF
# cdf <- byChipType(this, chipType=chipType, tags=tags, ...)
#
# cdf
#}, protected=TRUE)
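# getUnitGroupCellMapWithUnique(): returns the (unit, group, cell) map of
# this (main) CDF with an extra column 'cellM' giving the corresponding
# cell indices in the unique CDF.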
setMethodS3("getUnitGroupCellMapWithUnique", "AffymetrixCdfFile", function(this, units=NULL, ..., ugcMapM=NULL, verbose=FALSE) {
# Argument 'this':
if (isUniqueCdf(this)) {
throw("Argument 'this' is already a unique CDF: ", getPathname(this))
}
# Argument 'units':
units0 <- units
# Argument 'ugcMapM':
if (is.null(ugcMapM)) {
cdfM <- getUniqueCdf(this)
} else if (!isUnitGroupCellMap(ugcMapM)) {
throw("Argument 'ugcMapM' is not a UnitGroupCellMap")
}
# Merge maps using sorted, unique units (and expand back at the end)
if (!is.null(units))
units <- sort(unique(units))
ugcMap <- list()
# Get the (unit, group, cell) map for this CDF
ugcMap[[1]] <- getUnitGroupCellMap(this, units=units, verbose=verbose)
# Get the (unit, group, cell) map for the unique CDF
if (is.null(ugcMapM)) {
ugcMap[[2]] <- getUnitGroupCellMap(cdfM, units=units, verbose=verbose)
} else {
ugcMap[[2]] <- ugcMapM
}
# Expand the unique CDF (unit, group, cell) map to the main one
# (1) Create a (unit, group) -> hashcode map
MAXGROUP <- 10000
ugHashcode <- lapply(ugcMap, FUN=function(map) {
units <- map[,"unit"]
groups <- map[,"group"]
hashcode <- MAXGROUP*units + groups
hashcode
})
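# Example: unit 5, group 2 => hashcode 10000*5 + 2 = 50002; the codes are
# unique as long as no unit has more than 10,000 groups.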
# (2) Map (unit, group)_unique to (unit, group)_main
rr <- match(ugHashcode[[1]], ugHashcode[[2]])
# Not needed anymore
ugHashcode <- NULL
mergedMap <- cbind(ugcMap[[1]], cellM=ugcMap[[2]][rr,"cell"])
# Not needed anymore
ugcMap <- NULL
mergedMap
}, protected=TRUE)