R/cell_composition.R
In sesame: Tools For Analyzing Illumina Infinium DNA Methylation Arrays

Documented in diffRefSet estimateCellComposition estimateLeukocyte getRefSet twoCompsEst2

## clean reference set to non NA sites
cleanRefSet <- function(g, platform = c('EPIC','HM450','HM27')) {

    platform <- match.arg(platform)
    pkgTest('GenomicRanges')
    mapinfo <- sesameDataGet(paste0(
        platform, '.probeInfo'))[[paste0('mapped.probes.hg19')]]
    g <- g[intersect(rownames(g), names(mapinfo)),,drop=FALSE]
    g.clean <- g[apply(g, 1, function(x) !any(is.na(x))),,drop=FALSE]
    g.clean <- g.clean[rownames(g.clean) %in% names(mapinfo),,drop=FALSE]
    g.clean <- g.clean[!(as.vector(GenomicRanges::seqnames(
        mapinfo[rownames(g.clean)])) %in% c('chrX','chrY','chrM')),,drop=FALSE]
    g.clean <- g.clean[grep('cg',rownames(g.clean)),,drop=FALSE]
    g.clean
}

#' Restrict refset to differentially methylated probes
#' use with care, might introduce bias
#'
#' The function takes a matrix with probes on the rows and cell types on
#' the columns and output a subset matrix and only probes that show
#' discordant methylation levels among the cell types.
#' 
#' @param g a matrix with probes on the rows and cell types on the columns
#' @return g a matrix with a subset of input probes (rows)
#' @examples
#' g <- diffRefSet(getRefSet(platform='HM450'))
#' @export
diffRefSet <- function(g) {
    g <- g[apply(g, 1, function(x) min(x) != max(x)),]
    message(
        'Reference set is based on ', dim(g)[1], ' differential probes from ',
        dim(g)[2], ' cell types.')
    g
}

#' Retrieve reference set
#'
#' The function retrieves the curated reference DNA methylation status for
#' a set of cell type names under the Infinium platform. Supported cell types
#' include "CD4T", "CD19B", "CD56NK", "CD14Monocytes", "granulocytes", "scFat",
#' "skin" etc. See package sesameData for more details. The function output a
#' matrix with probes on the rows and specified cell types on the columns.
#' 0 suggests unmethylation and 1 suggests methylation. Intermediate
#' methylation and nonclusive calls are left with NA. 
#'
#' @param cells reference cell types
#' @param platform EPIC or HM450
#' @import stats
#' @return g, a 0/1 matrix with probes on the rows and specified cell types
#' on the columns.
#' @examples
#' betas <- getRefSet('CD4T', platform='HM450')
#' @export
getRefSet <- function(cells=NULL, platform = c('EPIC','HM450')) {
    
    platform <- match.arg(platform)
    if (is.null(cells)) {
        cells <- c('CD4T', 'CD19B','CD56NK','CD14Monocytes', 'granulocytes');
    }

    refdata <- sesameDataGet('ref.methylation')[cells]
    probes <- Reduce(intersect, lapply(refdata, names))
    g <- do.call(cbind, lapply(refdata, function(x) x[probes]))
    g <- cleanRefSet(g, platform);
    message(
        'Reference set is based on ', dim(g)[1], ' probes from ',
        dim(g)[2], ' cell types.')
    g
}

errFunc <- function(f, g, q) {
    gamma <- q - g %*% f[2:length(f)]
    sum(ifelse(gamma < f[1] / 2, abs(gamma), abs(gamma - f[1])), na.rm=TRUE)
}

## transform fraction (f) by altering 2 components (nu1 and nu2) by step.size
double.transform.f <- function(f, nu1, nu2, step.size) {
    if (f[nu1] + step.size > 1) return(NULL);
    if (f[nu2] - step.size < 0) return(NULL);
    f[nu1] <- f[nu1] + step.size
    f[nu2] <- f[nu2] - step.size
    f[1] <- 1 - sum(f[2:length(f)]) # renormalize to avoid numerical drift
    f
}

dichotomize <- function(q, rmin=0.15, rmax=0.85) {
    q[q > rmax] <- 1
    q[q < rmin] <- 0
    middle <- na.omit(which(q >= rmin & q <= rmax))
    q[middle] <- (q[middle]-rmin) / (rmax-rmin)
    q
}

getg0 <- function(f, g, q) {
    gamma <- q - g %*% f[2:length(f)]
    ifelse(gamma < f[1] / 2, 0, 1)
}

## @param frac0 initial fraction
.optimizeCellComposition <- function(
    g, q, frac0=NULL, temp=0.5, maxIter=1000,
    delta=0.0001, step.max=1.0, verbose=FALSE) {

    M <- ncol(g) # number of reference
    if (is.null(frac0)) {
        frac <- c(1, rep(0, M))
    } else { # use given fraction estimate
        frac <- frac0
    }
    
    ## initialize
    errcurrent <- errFunc(frac, g, q)
    errmin <- errcurrent
    frac.min <- frac
    niter <- 1

    ## initialize
    repeat {
        nu <- sample(seq_len(M+1), 2)
        step.size <- runif(1) * step.max
        frac.test <- double.transform.f(frac, nu[1], nu[2], step.size);
        if (!is.null(frac.test)) {

            if (verbose) {
                message(
                    'errcurrent=', errcurrent, 'frac=',
                    paste(lapply(
                        frac,
                        function(x) sprintf('%1.2f', x)), collapse='-'),
                    ';stepsize=', step.size, ';temp=', temp, ';best=',
                    paste(lapply(
                        frac.min,
                        function(x) sprintf('%1.2f', x)), collapse='-'),
                    ';err=', errmin)
            }

            errtest <- errFunc(frac.test, g, q)
            
            ## update best
            if (errtest < errmin) {
                errmin <- errtest
                frac.min <- frac.test
                if ((errmin-errtest) > errmin*delta)
                    niter <- 1;
            } else {
                niter <- niter + 1
                if (niter > maxIter) {
                    break;
                }
            }
            
            ## rejection sampling
            if (runif(1) < exp(-(errtest-errcurrent)/temp)) {
                errcurrent <- errtest
                frac <- frac.test
            }
        }
    }

    list(frac.min=frac.min, errmin=errmin)
}

#' Estimate cell composition using reference
#'
#' This is a reference-based cell composition estimation. The function takes a
#' reference methylation status matrix (rows for probes and columns for cell
#' types, can be obtained by getRefSet function) and a query beta value
#' measurement. The length of the target beta values should be the same as
#' the number of rows of the reference matrix. The method assumes one unknown
#' component. It outputs a list containing the estimated cell fraction, the
#' error of optimization and methylation status of the unknown component.
#'
#' @param g reference methylation
#' @param q target measurement: length(q) == nrow(g)
#' @param dichotomize to dichotomize query beta value before estimate,
#' this relieves unclean background subtraction
#' @param refine to refine estimate, takes longer
#' @param ... extra parameters for optimization, this includes
#' temp - annealing temperature (0.5)
#' maxIter - maximum iteration to stop after converge (1000)
#' delta - delta score to reset counter (0.0001)
#' verbose - output debug info (FALSE)
#' @return a list of fraction, min error and unknown component methylation state
## @examples
## g <- diffRefSet(getRefSet(platform='HM450'))
## M <- ncol(g)
## trueFrac <- runif(M+1)
## trueFrac <- trueFrac / sum(trueFrac)
## g0 <- sample(c(0,1), nrow(g), replace=TRUE)
## q <- cbind(g0, g) %*% trueFrac + rnorm(length(g0), mean=0, sd = 0.0)
## q[q<0] <- 0
## q[q>1] <- 1
## est <- estimateCellComposition(g, q)
## @export - TODO fix colinearity
estimateCellComposition <- function(
    g, q, refine=TRUE, dichotomize=FALSE, ...) {

    if (dichotomize) {
        q <- dichotomize(q);
    }

    ## raw
    res <- .optimizeCellComposition(g, q, step.max=1, ...);
    ## refine
    res <- .optimizeCellComposition(
        g, q, frac0=res$frac.min, step.max=0.05, ...)

    list(
        frac = setNames(res$frac.min, c("unknown", colnames(g))),
        err = res$errmin,
        g0 = getg0(res$frac.min, g, q))
}


#' Estimate leukocyte fraction using a two-component model
#'
#' The method assumes only two components in the mixture: the leukocyte
#' component and the target tissue component. The function takes the beta
#' values matrix of the target tissue and the beta value matrix of the
#' leukocyte. Both matrices have probes on the row and samples on the column.
#' Row names should have probe IDs from the platform. The function outputs
#' a single numeric describing the fraction of leukocyte.
#'
#' @param betas.tissue tissue beta value matrix (#probes X #samples)
#' @param betas.leuko leukocyte beta value matrix,
#' if missing, use the SeSAMe default by infinium platform
#' @param betas.tumor optional, tumor beta value matrix
#' @param platform "HM450", "HM27" or "EPIC"
#' @return leukocyte estimate, a numeric vector
#' @importFrom utils head
#' @importFrom utils tail
#' @examples
#'
#' betas.tissue <- sesameDataGet('HM450.1.TCGA.PAAD')$betas
#' estimateLeukocyte(betas.tissue)
#' 
#' @export
estimateLeukocyte<-function(
    betas.tissue, betas.leuko = NULL,
    betas.tumor = NULL,
    platform = c('EPIC','HM450','HM27')){
    
    platform <- match.arg(platform)

    if (!is.matrix(betas.tissue))
        betas.tissue <- as.matrix(betas.tissue)

    if (is.null(betas.leuko)) {
        betas.leuko <- sesameDataGet('leukocyte.betas')[[platform]]
    }

    if (!is.matrix(betas.leuko))
        betas.leuko <- as.matrix(betas.leuko)

    ## choose the probes to work with
    ave.leuko <- rowMeans(betas.leuko, na.rm=TRUE)
    ave.tissue <- rowMeans(betas.tissue, na.rm=TRUE)

    probes <- intersect(names(ave.leuko), names(ave.tissue))
    ave.leuko <- ave.leuko[probes]
    ave.tissue <- ave.tissue[probes]

    if (toupper(platform) %in% c("HM450", "EPIC"))
        nprobes <- 1000
    else if (toupper(platform) == "HM27")
        nprobes <- 100
    else
        stop("Platform not supported.")
    
    tt <- sort(ave.leuko - ave.tissue)
    ## leukocyte-specific hypo-methylation
    probes.leuko.low <- names(head(tt, n=nprobes))
    ## leukocyte-specific hyper-methylation
    probes.leuko.high <- names(tail(tt, n=nprobes))

    ## test tumor if specified
    if (!is.null(betas.tumor)) {
        if (!is.matrix(betas.tumor))
            betas.tumor <- as.matrix(betas.tumor)
        betas.tissue <- betas.tumor
    }
        
    ## more refined range if dataset is big
    if (dim(betas.tissue)[2] >= 10) {
        ## the old code doesn't exclude NAs, this should be better
        t.high <- apply(betas.tissue[probes.leuko.high,],1,min, na.rm=TRUE)
        t.low <- apply(betas.tissue[probes.leuko.low,],1,max, na.rm=TRUE)
    } else {
        t.high <- rep(0.0, length(probes.leuko.high))
        t.low <- rep(1.0, length(probes.leuko.low))
    }
    l.high <- as.numeric(as.matrix(ave.leuko[probes.leuko.high]))
    l.low <- as.numeric(as.matrix(ave.leuko[probes.leuko.low]))

    ## calculating Leukocyte Percentage
    leuko.estimate <- vapply(seq_len(ncol(betas.tissue)), function(i) {
        s.high <- betas.tissue[probes.leuko.high, i]
        s.low <- betas.tissue[probes.leuko.low, i]
        p <- c(
            (s.high - t.high) / (l.high - t.high),
            (s.low - t.low) / (l.low - t.low))

        if (sum(!is.na(p)) < 10) return(NA); # not enough data
        dd <- density(na.omit(p))
        dd$x[which.max(dd$y)]
    }, numeric(1))
    
    names(leuko.estimate) <- colnames(betas.tissue)
    leuko.estimate
}

twoCompsDiff <- function(pop1, pop2) {
    pb <- intersect(rownames(pop1), rownames(pop2))
    pop1 <- pop1[pb,]
    pop2 <- pop2[pb,]
    tt <- sort(rowMeans(pop1) - rowMeans(pop2))
    res <- list(
        diff_1m2u = names(tail(tt, n=1000)),
        diff_1u2m = names(head(tt, n=1000)))
    res
}


#' Estimate the fraction of the 2nd component in a 2-component mixture
#' 
#' @param pop1 Reference methylation level matrix for population 1
#' @param pop2 Reference methylation level matrix for population 2
#' @param target Target methylation level matrix to be analyzed
#' @param diff_1m2u A vector of differentially methylated probes (methylated
#' in population 1 but unmethylated in population 2)
#' @param diff_1u2m A vector of differentially methylated probes (unmethylated
#' in population 1 but methylated in population 2)
#' @param use.ave use population average in selecting differentially
#' methylated probes
#' @return Estimate of the 2nd component in the 2-component mixture
twoCompsEst2 <- function(
    pop1, pop2, target, use.ave=TRUE, diff_1m2u=NULL, diff_1u2m=NULL) {

    pb <- intersect(
        intersect(rownames(pop1), rownames(pop2)),
            rownames(target))
    cat(length(pb), "probes are shared among data sets. Starting from there.\n")
    
    pop1 <- pop1[pb,]
    pop2 <- pop2[pb,]
    target <- target[pb,]

    if (is.null(diff_1m2u) || is.null(diff_1u2m)) {
        if (use.ave) {
            tt <- sort(rowMeans(pop1) - rowMeans(pop2))
            diff_1m2u <- names(tail(tt, n=1000))
            diff_1u2m <- names(head(tt, n=1000))
        } else {
            diff_1u2m <- names(which(
                apply(pop1,1,function(x) {
                    all(x<0.3, na.rm=TRUE) && sum(is.na(x)) / length(x) < 0.5
                }) & apply(pop2,1,function(x) {
                    all(x>0.7, na.rm=TRUE) && sum(is.na(x)) / length(x) < 0.5
                })))
            
            diff_1m2u <- names(which(
                apply(pop1,1,function(x) {
                    all(x>0.7, na.rm=TRUE) && sum(is.na(x)) / length(x) < 0.5
                }) & apply(pop2,1,function(x) {
                    all(x<0.3, na.rm=TRUE) && sum(is.na(x)) / length(x) < 0.5
                })))
        }
    }
    
    cat(length(diff_1u2m),
        "probes methylated in 2 and unmethylated in 1.\n")
    cat(length(diff_1m2u),
        "probes methylated in 1 and unmethylated in 2.\n")

    bnd_1u2m_hi <- rowMaxs(pop2[diff_1u2m,])
    bnd_1u2m_lo <- rowMins(pop1[diff_1u2m,])

    bnd_1m2u_hi <- rowMaxs(pop1[diff_1m2u,])
    bnd_1m2u_lo <- rowMins(pop2[diff_1m2u,])

    est <- vapply(seq_len(ncol(target)), function(i) {
        xx <- c(
        (target[diff_1u2m,i] - bnd_1u2m_lo) / (bnd_1u2m_hi - bnd_1u2m_lo),
        1-(target[diff_1m2u,i] - bnd_1m2u_lo) / (bnd_1m2u_hi - bnd_1m2u_lo))
        
        dd <- density(na.omit(xx))
        dd$x[which.max(dd$y)]
    }, numeric(1))

    names(est) <- colnames(target)
    est
}

Any scripts or data that you put into this service are public.

sesame documentation built on Nov. 15, 2020, 2:08 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

sesame
Tools For Analyzing Illumina Infinium DNA Methylation Arrays

R/cell_composition.R
In sesame: Tools For Analyzing Illumina Infinium DNA Methylation Arrays

Defines functions twoCompsEst2 twoCompsDiff estimateLeukocyte estimateCellComposition .optimizeCellComposition getg0 dichotomize double.transform.f errFunc getRefSet diffRefSet cleanRefSet

Documented in diffRefSet estimateCellComposition estimateLeukocyte getRefSet twoCompsEst2

Try the sesame package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sesame Tools For Analyzing Illumina Infinium DNA Methylation Arrays

R/cell_composition.R In sesame: Tools For Analyzing Illumina Infinium DNA Methylation Arrays

Defines functions twoCompsEst2 twoCompsDiff estimateLeukocyte estimateCellComposition .optimizeCellComposition getg0 dichotomize double.transform.f errFunc getRefSet diffRefSet cleanRefSet

Documented in diffRefSet estimateCellComposition estimateLeukocyte getRefSet twoCompsEst2

Try the sesame package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sesame
Tools For Analyzing Illumina Infinium DNA Methylation Arrays

R/cell_composition.R
In sesame: Tools For Analyzing Illumina Infinium DNA Methylation Arrays