#' @title Estimate Size Factors of Samples
#'
#' @description This function estimates sample size factors for normalization
#' purpose in downsteam analysis. Size factors of sample pairs are estimated
#' firstly by comparing samples to one reference sample (i.e. the sample
#' corresponding to first column of count matrix). Then, size factors are
#' combined across all samples with the median size factor as 1. In detail,
#' binding type is first estimated using the same strategy as function
#' \code{chipType} for each sample pair. When biomodel, size factor is
#' calcualted based on a decision on which kernale density mode to be used for
#' scaling.
#'
#' @param count A matrix of read counts or a SummarizedExperiment, where
#' columns are samples and rows are peaks or high coverage bins. This object
#' can be generated by function \code{regionReads}.
#' @param cutoff A numeric cut off on count matrix. If positive, only
#' peaks/bins with counts larger than \code{cutoff} in at least one sample are
#' used to estimate the size factors. We recommend a larger cutoff since
#' background signal can dramatically mask the right estimation of kernal
#' density, especially for deep sequenced ChIP-seq samples. (Default: 50)
#' @param fold A numeric threshold to help determining the binding type. In
#' detail, if top 2 estimated modes on smoothed kernal density have a height
#' differece less than the folds given by \code{fold}, binding type will be
#' determined as bimodel; otherwise, it is unimodel. This number should be
#' larger than 1. (Default: 10)
#' @param h Initial smoothing factor when estimating kernal density for bump
#' hunting. (Default: 0.1)
#' @param plot A logical indicator that if M-A plot and smoothed kernal density
#' should be visualized. (Default: FALSE)
#' @param sanity A logical indicator if checking sanity across replicates in
#' the same conditions. A negative report of sanity check indicates either a
#' bad experiment (e.g. binding type is not consistent across replicates) or a
#' bad initiation of function parameters (e.g. \code{cutoff} and \code{fold}
#' are not pre-estimated well). However, a negative report of sanity check
#' doesn't neccessarily mean a bad estimation of size factors, as the strategy
#' of hunting kernal density mode is robust to find the right scaling
#' regardless of unimodel or bimodel. (Default: FALSE)
#' @param cond \code{NULL} or a two-level factor specifying biological
#' conditions for samples in \code{count} (e.g. control & treatment). This
#' parameter is different from the meta information in \code{count} when
#' \code{count} is a SummarizedExperiment. It only includes information of
#' conditions, and should be a factor object. This parameter is only appliable
#' when \code{sanity} is TRUE. (Default: NULL)
#'
#' @importFrom matrixStats rowMaxs
#' @importFrom matrixStats rowDiffs
#' @import SummarizedExperiment
#' @importFrom methods is
#' @importFrom graphics abline
#' @importFrom graphics layout
#' @importFrom stats dnorm
#'
#' @return
#' A list with the following conponents:
#' \item{sizefac}{A numeric vector indicating estimated size factors of
#' samples}
#' \item{type}{A character vector with value either "bimodel" or "unimodel",
#' indicating the binding types by comparing ro the sample "control"}
#'
#' @export
#'
#' @examples
#' ## load sample data
#' data(complex)
#' names(complex)
#'
#' ## test sample data
#' sizeFac(count=complex$counts)
sizeFac <- function(count, cutoff=50L, fold=10, h=0.1, plot=FALSE,
sanity=FALSE, cond=NULL){
stopifnot((is.matrix(count) || is(count,"SummarizedExperiment")) &&
ncol(count) >= 2)
stopifnot(is.numeric(cutoff) && length(cutoff) == 1 && cutoff >= 0)
stopifnot(is.numeric(fold) && length(fold) == 1 && fold > 1)
stopifnot(is.numeric(h) && length(h) == 1 && h > 0)
stopifnot(is.logical(plot) && length(plot) == 1)
stopifnot(is.logical(sanity) && length(sanity) == 1)
if(sanity){
stopifnot(is.factor(cond) && nlevels(cond) == 2 &&
ncol(count) == length(cond))
}
## loop on each sample pair
if(is(count,"SummarizedExperiment")) count <- assay(count,1)
sizefacs <- 1
cmplxtypes <- 'unimodel'
for(i in seq_len(ncol(count))[-1]){
countpair <- count[,c(1,i)]
## raw M & A
counttmp <- countpair[rowMaxs(countpair) >= cutoff,]
logcount <- log2(counttmp + 0.5)
M <- rowDiffs(logcount)
A <- rowMeans(logcount)
## kernal density bumps
bump <- c()
hpair <- h
ix <- seq(round(min(M), 1), max(M), 0.05)
while(length(bump)<2){
dm <- rowMeans(sapply(M, function(m) dnorm(ix, mean=m, sd=hpair)))
bump <- which(diff(sign(diff(dm))) == -2) + 1
bump2 <- bump[order(dm[bump],
decreasing=TRUE)[seq_len(
min(2,length(bump)))]]
mu <- ix[bump2]
mudm <- dm[bump2]
hpair <- hpair * 0.8
}
## chip type
enrich <- abs(log(mudm[1] / mudm[2]))
if(enrich <= log(fold)){
cmplxtype <- "bimodel"
}else{
cmplxtype <- "unimodel"
}
## size factor
M0idx <- 1
if(cmplxtype == "bimodel"){
Mtmp <- M[A > max(A) * 0.8]
ind <- vector('integer',length=length(Mtmp))
ind[abs(Mtmp-mu[1]) <= abs(Mtmp-mu[2])] <- 1L
if(sum(ind) < length(ind)/2){
M0idx <- 2
}
}
M0 <- mu[M0idx]
sizefac <- c(1,2^M0)
## plots
if(plot){
layout(matrix(1:2,1,2))
plot(A,M,pch=20,cex=0.5,main=cmplxtype)
abline(h=0,lty=2,col='red',lwd=2)
plot(density(M,na.rm = TRUE,adjust=1),xlab='M',
main=cmplxtype,lwd=2)
if(cmplxtype == "bimodel")
abline(v=mu,lty=2,col='blue',lwd=2)
else abline(v=M0,lwd=2,col='blue',lty=2)
}
sizefacs <- c(sizefacs,sizefac[2])
cmplxtypes <- c(cmplxtypes,cmplxtype)
}
## sanity check
if(sanity){
sanind <- all(sapply(levels(cond),function(x)
length(unique(cmplxtypes[cond==x]))==1))
if(sanind) cat("Sanity check passed...\n")
else cat("Sanity check failed...\n")
}
sizefacs <- sizefacs / median(sizefacs)
cmplxtypes[1] <- "control"
list(sizefac=sizefacs,type=cmplxtypes)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.