###########################################################################/**
# @RdocClass BaseCountNormalization
#
# @title "The BaseCountNormalization class"
#
# \description{
# @classhierarchy
#
# This class represents a normalization method that corrects for systematic
# effects in the probe intensities due to differences in the number of
# A, C, G, and T:s in the probe sequences.
# }
#
# @synopsis
#
# \arguments{
# \item{...}{Arguments passed to the constructor of
# @see "AbstractProbeSequenceNormalization".}
# \item{model}{A @character string specifying the model used to fit
# the base-count effects.}
# \item{bootstrap}{If @TRUE, the model fitting is done by bootstrap in
# order to save memory.}
# }
#
# \section{Fields and Methods}{
# @allmethods "public"
# }
#
# \section{Requirements}{
# This class requires that an aroma probe sequence file is available
# for the chip type.
# }
#
# @author "HB"
#*/###########################################################################
setConstructorS3("BaseCountNormalization", function(..., model=c("robustSmoothSpline", "lm"), bootstrap=FALSE) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Validate arguments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Argument 'model':
model <- match.arg(model)
# Argument 'bootstrap':
bootstrap <- Arguments$getLogical(bootstrap)
if (bootstrap && model == "lm") {
throw("Bootstrapping for the 'lm' model is not implemented.")
}
extend(AbstractProbeSequenceNormalization(...), "BaseCountNormalization",
.model = model,
.bootstrap=bootstrap,
.chunkSize=as.integer(2.5e6),
.maxIter=as.integer(50),
.acc=0.005
)
})
setMethodS3("getAsteriskTags", "BaseCountNormalization", function(this, collapse=NULL, ...) {
tags <- NextMethod("getAsteriskTags", collapse=NULL)
# Add model tag?
model <- this$.model
if (model != "robustSmoothSpline") {
tags <- c(tags, model)
}
# Add bootstrap tag?
if (this$.bootstrap) {
bootstrapTag <- "B"
tags <- c(tags, bootstrapTag)
}
# Collapse?
tags <- paste(tags, collapse=collapse)
tags
}, protected=TRUE)
setMethodS3("getParameters", "BaseCountNormalization", function(this, ...) {
# Get parameters from super class
params <- NextMethod("getParameters")
params <- c(params, list(
model = this$.model,
bootstrap = this$.bootstrap,
chunkSize = this$.chunkSize,
maxIter = this$.maxIter,
acc = this$.acc
))
params
}, protected=TRUE)
setMethodS3("getDesignMatrix", "BaseCountNormalization", function(this, cells=NULL, model=NULL, ..., force=FALSE, cache=TRUE, verbose=FALSE) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Validate arguments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Argument 'cells':
if (is.null(cells)) {
} else {
# Validated below...
}
# Argument 'verbose':
verbose <- Arguments$getVerbose(verbose)
if (verbose) {
pushState(verbose)
on.exit(popState(verbose))
}
verbose && enter(verbose, "Retrieving design matrix")
verbose && cat(verbose, "Cells:")
verbose && str(verbose, cells)
verbose && cat(verbose, "Model: ", model)
# Retrieve nucleotide counts as raw values whenever
# possible to save memory.
if (model == "lm") {
mode <- "integer"
oneValue <- as.integer(1)
} else if (model == "robustSmoothSpline") {
mode <- "raw"
oneValue <- as.raw(1)
}
verbose && enter(verbose, "Locating probe-sequence annotation data")
# Locate AromaCellSequenceFile holding probe sequences
acs <- getAromaCellSequenceFile(this, verbose=less(verbose, 5))
verbose && exit(verbose)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Check file cache
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
key <- list(
method="getDesignMatrix", class=class(this[1]),
cells=cells,
model=model,
acs=list(fullname=getFullName(acs))
)
dirs <- c("aroma.affymetrix", getChipType(acs))
if (!force) {
X <- loadCache(key=key, dirs=dirs)
if (!is.null(X)) {
verbose && cat(verbose, "Cached results found.")
verbose && exit(verbose)
return(X)
}
}
verbose && enter(verbose, "Count nucleotide bases for *all* cells")
verbose && cat(verbose, "Chip type: ", getChipType(acs))
designMatrix <- countBases(acs, mode=mode, verbose=less(verbose, 5))
# Not needed anymore
acs <- NULL
verbose && cat(verbose, "Nucleotide base counts:")
verbose && str(verbose, designMatrix)
verbose && cat(verbose, "object.size(designMatrix): ",
object.size(designMatrix))
verbose && exit(verbose)
if (!is.null(cells)) {
verbose && enter(verbose, "Extracing sequences of interest")
verbose && cat(verbose, "Cells:")
verbose && str(verbose, cells)
nbrOfCells <- nrow(designMatrix)
cells <- Arguments$getIndices(cells, max=nbrOfCells)
designMatrix <- designMatrix[cells,,drop=FALSE]
# Not needed anymore
cells <- NULL
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
verbose && cat(verbose, "object.size(designMatrix): ",
object.size(designMatrix))
verbose && exit(verbose)
}
designMatrix[,1] <- oneValue
verbose && cat(verbose, "Design matrix:")
verbose && str(verbose, designMatrix)
verbose && cat(verbose, "object.size(designMatrix): ",
object.size(designMatrix))
# Garbage collect
gc <- gc()
verbose && print(verbose, gc)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Cache results?
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if (cache) {
saveCache(X, key=key, dirs=dirs)
}
verbose && exit(verbose)
designMatrix
}, private=TRUE)
setMethodS3("fitOne", "BaseCountNormalization", function(this, df, ..., verbose=FALSE) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Local functions
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Arguments 'y' and 'X' must not contain NAs.
fitBaseCounts <- function(y, X, subset=NULL, model=c("robustSmoothSpline", "lm"), ...) {
# Argument 'y':
# Argument 'X':
if (nrow(X) != length(y)) {
throw("The number of rows in design matrix 'X' does not match the number of observations in 'y': ", nrow(X), " != ", length(y))
}
if (!is.null(subset)) {
y <- y[subset]
X <- X[subset,,drop=FALSE]
gc <- gc()
}
# Argument 'model':
model <- match.arg(model)
if (model == "lm") {
requireNamespace("stats") || throw("Package not loaded: stats")
lm.fit <- stats::lm.fit
fitFcn <- function(X, y, ...) {
fit <- lm.fit(x=X, y=y, ...)
# Remove redundant parameters
for (ff in c("residuals", "effects", "fitted.values", "qr")) {
fit[[ff]] <- NULL
}
fit
}
} else if (model == "robustSmoothSpline") {
fitFcn <- function(X, y, ...) {
fits <- list()
for (cc in 1:ncol(X)) {
# Fit effect of term #cc
if (cc == 1) {
mu <- median(y)
fit <- list(mu=mu)
} else {
# Note: 'X' may be a "raw" matrix (to save memory)
Xcc <- as.double(X[,cc])
fit <- .robustSmoothSpline(x=Xcc, y=y, ...)
## # Remove redundant parameters (although really small here)
## for (ff in c("x", "y", "w", "yin", "lev")) {
## fit[[ff]] <- NULL
## }
mu <- predict(fit, x=Xcc)$y
# Not needed anymore
Xcc <- NULL
}
# Remove the effect of term #cc
y <- y - mu
# Not needed anymore
mu <- NULL
fits[[cc]] <- fit
# Not needed anymore
fit <- NULL
}
fits
}
}
fit <- fitFcn(X, y)
fit
} # fitBaseCounts()
fitSubset <- function(df, cells=NULL, ..., verbose) {
verbose && enter(verbose, "Reading signals to fit")
verbose && cat(verbose, "Cells:")
verbose && str(verbose, cells)
y <- extractMatrix(df, cells=cells, drop=TRUE, verbose=less(verbose, 10))
verbose && exit(verbose)
if (shift != 0) {
verbose && enter(verbose, "Shifting signals")
verbose && cat(verbose, "Shift: ", shift)
y <- y + shift
verbose && exit(verbose)
}
verbose && enter(verbose, "Log2 transforming signals")
y <- log2(y)
verbose && cat(verbose, "Target log2 probe signals:")
verbose && str(verbose, y)
verbose && exit(verbose)
verbose && enter(verbose, "Identify finite data points")
n <- length(y)
# Fit only finite subset
keep <- which(is.finite(y))
y <- y[keep]
cells <- cells[keep]
# Not needed anymore
keep <- NULL
n2 <- length(y)
verbose && printf(verbose, "Removed %d (%.4f%%) out of %d non-finite data points: ", n-n2, 100*(n-n2)/n, n)
gc <- gc()
verbose && exit(verbose)
verbose && enter(verbose, "Fitting base-count model")
X <- getDesignMatrix(this, cells=cells, model=model, verbose=less(verbose, 5))
# Not needed anymore
cells <- NULL
verbose && cat(verbose, "Design matrix:")
verbose && str(verbose, X)
gc <- gc()
verbose && print(verbose, gc)
fit <- fitBaseCounts(y, X=X, model=model, verbose=less(verbose, 5))
# Not needed anymore
y <- X <- NULL
verbose && print(verbose, fit)
verbose && exit(verbose)
fit
} # fitSubset()
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Validate arguments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Argument 'df':
df <- Arguments$getInstanceOf(df, "AffymetrixCelFile")
# Argument 'verbose':
verbose <- Arguments$getVerbose(verbose)
if (verbose) {
pushState(verbose)
on.exit(popState(verbose))
}
verbose && enter(verbose, "Fitting normalization function for one array")
verbose && cat(verbose, "Full name: ", getFullName(df))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Getting algorithm parameters
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
verbose && enter(verbose, "Getting algorithm parameters")
params <- getParameters(this, expand=TRUE, verbose=less(verbose, 5))
gc <- gc()
verbose && print(verbose, gc)
units <- params$unitsToFit
verbose && cat(verbose, "Units:")
verbose && str(verbose, units)
cells <- params$cellsToFit
stratifyBy <- params$typesToFit
verbose && cat(verbose, "stratifyBy: ", stratifyBy)
verbose && cat(verbose, "Cells:")
verbose && str(verbose, cells)
shift <- params$shift
verbose && cat(verbose, "Shift: ", shift)
# Other model parameters
model <- params$model
verbose && cat(verbose, "Model: ", model)
bootstrap <- params$bootstrap
chunkSize <- params$chunkSize
maxIter <- params$maxIter
acc <- params$acc
verbose && exit(verbose)
nbrOfCells <- length(cells)
# Is bootstrapping necessary?
if (chunkSize >= nbrOfCells) {
verbose && cat(verbose, "Bootstrapping not really needed.")
bootstrap <- FALSE
}
# Bootstrap?
if (bootstrap) {
verbose && enter(verbose, "Fitting model using bootstrap")
verbose && cat(verbose, "Max number of iterations: ", maxIter)
verbose && cat(verbose, "Accuracy threshold: ", acc)
bb <- 0
fit <- NULL
deltaMax <- Inf
while (deltaMax > acc && bb < maxIter) {
bb <- bb + 1
verbose && enter(verbose, sprintf("Bootstrap iteration #%d", as.integer(bb)))
verbose && enter(verbose, "Fitting model to subset of data")
verbose && cat(verbose, "Chunk size: ", chunkSize)
subset <- sample(1:nbrOfCells, size=chunkSize)
cellsChunk <- cells[subset]
# Not needed anymore
subset <- NULL
cellsChunk <- sort(cellsChunk)
verbose && cat(verbose, "Cells:")
verbose && str(verbose, cellsChunk)
fitB <- fitSubset(df, cells=cellsChunk, verbose=verbose)
# Not needed anymore
cellsChunk <- NULL
verbose && exit(verbose)
verbose && enter(verbose, "Updating estimates")
if (is.null(fit)) {
fit <- fitB
} else {
# Update parameters for each subfit
deltaMax <- 0
for (kk in seq_along(fit)) {
fitKK <- fit[[kk]]
fitBKK <- fitB[[kk]]
if (kk == 1) {
# Average mean estimates
fitKK$mu <- mean(c(fitKK$mu, fitBKK$mu), na.rm=TRUE)
} else {
# Average spline estimates
# Common x:s
x <- sort(unique(c(fitKK$x, fitBKK$x)))
# Predict y:s based on bootstrap pool and new fit
y <- fitKK$y
yB <- predict(fitBKK, x=x)$y
# Weight them together
yB <- ((bb-1)*y + yB)/bb
delta <- mean(abs(y - yB), na.rm=TRUE)
deltaMax <- max(c(deltaMax, delta), na.rm=TRUE)
fitKK <- list(x=x, y=yB, delta=delta)
# Not needed anymore
x <- y <- yB <- delta <- NULL
}
fit[[kk]] <- fitKK
# Not needed anymore
fitKK <- fitBKK <- NULL
} # for (kk ...)
}
# Not needed anymore
fitB <- NULL
verbose && exit(verbose)
verbose && printf(verbose, "deltaMax < threshold: %.6f < %.6f\n", deltaMax, acc)
verbose && printf(verbose, "iteration < maxIter: %d < %d\n", as.integer(bb), as.integer(maxIter))
verbose && exit(verbose)
} # while()
converged <- (deltaMax <= acc)
verbose && enter(verbose, "Creating final fit")
for (kk in seq(from=2, to=length(fit))) {
fitKK <- fit[[kk]]
fitKK <- .robustSmoothSpline(x=fitKK$x, y=fitKK$y)
fit[[kk]] <- fitKK
}
verbose && exit(verbose)
fit$bootstrap <- list(iter=as.integer(bb), maxIter=maxIter, converged=converged)
verbose && exit(verbose)
} else {
fit <- fitSubset(df, cells=cells, verbose=verbose)
} # if (bootstrap)
verbose && exit(verbose)
fit
}, protected=TRUE)
setMethodS3("predictOne", "BaseCountNormalization", function(this, fit, ..., verbose=FALSE) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Local functions
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Arguments 'y' and 'X' must not contain NAs.
predictBaseCounts <- function(fit, X, model=c("robustSmoothSpline", "lm"), ...) {
# Argument 'model':
model <- match.arg(model)
if (model == "lm") {
predictFcn <- function(fit, X, ...) {
coefs <- coefficients(fit)
coefs <- as.matrix(coefs)
yPred <- X %*% coefs
yPred
} # predictFcn()
} else if (model == "robustSmoothSpline") {
predictFcn <- function(fit, X, ...) {
fits <- fit
yPred <- double(nrow(X))
for (cc in 1:ncol(X)) {
fit <- fits[[cc]]
if (cc == 1) {
mu <- fit$mu
} else {
mu <- rep(NA_real_, times=nrow(X))
if (mode(X) == "raw") {
# Note: 'X' may be a "raw" matrix (to save memory)
idxs <- which(X[,cc] != as.raw(255))
} else {
idxs <- which(is.finite(X[,cc]))
}
x <- as.double(X[idxs,cc])
mu[idxs] <- predict(fit, x=x)$y
# Not needed anymore
idxs <- x <- NULL
}
str(mu)
yPred <- yPred + mu
# Not needed anymore
fit <- mu <- NULL
} # for (cc ...)
yPred
} # predictFcn()
}
yPred <- predictFcn(fit, X)
yPred
} # predictBaseCounts()
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Validate arguments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Argument 'verbose':
verbose <- Arguments$getVerbose(verbose)
if (verbose) {
pushState(verbose)
on.exit(popState(verbose))
}
verbose && enter(verbose, "Predicting model for one array")
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Getting algorithm parameters
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
verbose && enter(verbose, "Getting algorithm parameters")
params <- getParameters(this, expand=TRUE, verbose=less(verbose, 5))
gc <- gc()
verbose && print(verbose, gc)
units <- params$unitsToUpdate
verbose && cat(verbose, "Units:")
verbose && str(verbose, units)
cells <- params$cellsToUpdate
stratifyBy <- params$typesToUpdate
verbose && cat(verbose, "stratifyBy: ", stratifyBy)
verbose && cat(verbose, "Cells:")
verbose && str(verbose, cells)
shift <- params$shift
verbose && cat(verbose, "Shift: ", shift)
# Other model parameters
model <- params$model
verbose && cat(verbose, "Model: ", model)
bootstrap <- params$bootstrap
chunkSize <- params$chunkSize
maxIter <- params$maxIter
acc <- params$acc
verbose && exit(verbose)
nbrOfCells <- length(cells)
verbose && enter(verbose, "Predicting mean log2 probe signals")
X <- getDesignMatrix(this, cells=cells, model=model, verbose=less(verbose, 5))
# Not needed anymore
cells <- NULL
verbose && cat(verbose, "Design matrix:")
verbose && str(verbose, X)
mu <- predictBaseCounts(fit, X=X, model=model)
# Not needed anymore
fit <- X <- NULL
verbose && str(verbose, "mu:")
verbose && str(verbose, mu)
verbose && exit(verbose)
# Sanity check
if (length(mu) != nbrOfCells) {
throw("Internal error. Number of estimated means does not match the number of cells to be updated: ", length(mu), " != ", nbrOfCells)
}
# Garbage collection
gc <- gc()
verbose && print(verbose, gc)
verbose && exit(verbose)
mu
}, protected=TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.