R/sesame.R

Defines functions bisConversionControl SigSetToSigDF chipAddressToSignal searchIDATprefixes readControls readIDATpair readIDAT1 inferPlatformFromTango getAFs getAFTypeIbySumAlleles getBetas betasCollapseToPfx SDFcollapseToPfx remove_suffix totalIntensities probeSuccessRate medianTotalIntensity meanIntensity

Documented in betasCollapseToPfx bisConversionControl chipAddressToSignal getAFs getAFTypeIbySumAlleles getBetas meanIntensity medianTotalIntensity probeSuccessRate readIDATpair SDFcollapseToPfx searchIDATprefixes totalIntensities

#' @title
#' Analyze DNA methylation data
#'
#' @description
#' SEnsible and step-wise analysis of DNA methylation data
#'
#' @details
#' This package complements array functionalities that allow
#' processing >10,000 samples in parallel on clusters.
#' @aliases sesame
#' @author
#' Wanding Zhou \email{Wanding.Zhou@vai.org},
#' Hui Shen \email{Hui.Shen@vai.org}
#' Timothy J Triche Jr \email{Tim.Triche@vai.org}
#'
#' @references Zhou W, Triche TJ, Laird PW, Shen H (2018)
## @seealso To appear
#' @examples
#'
#' sdf <- readIDATpair(sub('_Grn.idat','',system.file(
#'     'extdata','4207113116_A_Grn.idat',package='sesameData')))
#'
#' ## The OpenSesame pipeline
#' betas <- openSesame(sdf)
#'
#' @keywords DNAMethylation Microarray QualityControl
#' @return package
#'
"_PACKAGE"

#' Whole-dataset-wide Mean Intensity
#'
#' The function takes one single \code{SigDF} and computes mean
#' intensity of all the in-band measurements. This includes all Type-I
#' in-band measurements and all Type-II probe measurements. Both methylated
#' and unmethylated alleles are considered. This function outputs a single
#' numeric for the mean.
#'
#' Note: mean in this case is more informative than median because
#' methylation level is mostly bimodal.
#'
#' @param sdf a \code{SigDF}
#' @param mask whether to mask probes using mask column
#' @return mean of all intensities
#' @examples
#' sesameDataCache() # if not done yet
#' sdf <- sesameDataGet('EPIC.1.SigDF')
#' meanIntensity(sdf)
#' @export
meanIntensity <- function(sdf, mask = TRUE) {
    stopifnot(all(c("MG","UG","MR","UR") %in% colnames(sdf)))
    s <- signalMU(sdf, mask = mask)
    mean(c(s$M,s$U), na.rm=TRUE)
}

#' Whole-dataset-wide Median Total Intensity (M+U)
#'
#' The function takes one single \code{SigDF} and computes median
#' intensity of M+U for each probe. This function outputs a single
#' numeric for the median.
#'
#' @param sdf a \code{SigDF}
#' @param mask whether to mask probes using mask column
#' @return median of all intensities
#' @examples
#' sesameDataCache() # if not done yet
#' sdf <- sesameDataGet('EPIC.1.SigDF')
#' medianTotalIntensity(sdf)
#' @export
medianTotalIntensity <- function(sdf, mask = TRUE) {
    stopifnot(all(c("MG","UG","MR","UR") %in% colnames(sdf)))
    s <- signalMU(sdf, mask = mask)
    median(c(s$M + s$U), na.rm=TRUE)
}

#' Whole-dataset-wide Probe Success Rate
#'
#' This function calculates the probe success rate using
#' pOOBAH detection p-values. Probes that has a detection p-value
#' higher than a specific threshold are considered failed probes.
#' @param sdf a \code{SigDF}
#' @param max_pval the maximum p-value to consider detection success
#' @param mask whether or not we count the masked probes in SigDF
#' @return a fraction number as probe success rate
#' @examples
#' sesameDataCache() # if not done yet
#' sdf <- sesameDataGet('EPIC.1.SigDF')
#' probeSuccessRate(sdf)
#' @export
probeSuccessRate <- function(sdf, mask = TRUE, max_pval = 0.05) {
    pval <- pOOBAH(sdf, return.pval = TRUE)
    if (mask) { pval <- pval[!sdf$mask] }
    pval <- na.omit(pval)
    stopifnot(length(pval) > 100)
    sum(pval < max_pval) / length(pval)
}

#' M+U Intensities Array
#'
#' The function takes one single \code{SigDF} and computes total
#' intensity of all the in-band measurements by summing methylated and
#' unmethylated alleles. This function outputs a single numeric for the mean.
#' @param sdf a \code{SigDF}
#' @param mask whether to mask probes using mask column
#' @return a vector of M+U signal for each probe
#' @examples
#' sesameDataCache() # if not done yet
#' sdf <- sesameDataGet('EPIC.1.SigDF')
#' intensities <- totalIntensities(sdf)
#' @export
totalIntensities <- function(sdf, mask = FALSE) {
    stopifnot(all(c("MG","UG","MR","UR") %in% colnames(sdf)))
    s <- signalMU(sdf, mask = mask)
    setNames(s$M+s$U, s$Probe_ID)
}

remove_suffix <- function(tok) {
    vapply(strsplit(tok, "_"),
        function(x) x[1], character(1))
}

#' collapse to probe prefix
#'
#' @param sdf a SigDF object
#' @return a data frame with updated Probe_ID
#' @importFrom dplyr slice_min
#' @importFrom dplyr group_by
SDFcollapseToPfx <- function(sdf) {
    sdf$Probe_ID <- remove_suffix(sdf$Probe_ID)
    sdf$pval <- pOOBAH(sdf, return.pval = TRUE)
    ## take the best by p-value
    ## Note that mask = FALSE will override p-value
    slice_min(group_by(sdf, .data[['Probe_ID']]),
        .data[['mask']] + .data[['pval']], n=1, with_ties = FALSE)
}

#' Collapse betas by averagng probes with common probe ID prefix
#'
#' @param betas either a named numeric vector or a numeric matrix
#' (row: probes, column: samples)
#' @param BPPARAM use MulticoreParam(n) for parallel processing
#' @return either named numeric vector or a numeric matrix of collapsed
#' beta value matrix
#' @examples
#'
#' ## input is a matrix
#' m <- matrix(seq(0,1,length.out=9), nrow=3)
#' rownames(m) <- c("cg00004963_TC21", "cg00004963_TC22", "cg00004747_TC21")
#' colnames(m) <- c("A","B","C")
#' betasCollapseToPfx(m)
#'
#' ## input is a vector
#' m <- setNames(seq(0,1,length.out=3),
#'     c("cg00004963_TC21", "cg00004963_TC22", "cg00004747_TC21"))
#' betasCollapseToPfx(m)
#' @export
betasCollapseToPfx <- function(betas, BPPARAM=SerialParam()) {
    if (is.matrix(betas)) {
        pfxes <- remove_suffix(rownames(betas))
        out <- do.call(cbind, bplapply(
            seq_len(ncol(betas)), function(i) {
                vapply(split(betas[,i], pfxes), mean, numeric(1), na.rm=TRUE)
            }, BPPARAM=BPPARAM))
        colnames(out) <- colnames(betas)
        out
    } else {
        pfxes <- remove_suffix(names(betas))
        vapply(split(betas, pfxes), mean, numeric(1), na.rm=TRUE)
    }
}

#' Get beta Values
#'
#' sum.typeI is used for rescuing beta values on
#' Color-Channel-Switching CCS probes. The function takes a \code{SigDF}
#' and returns beta value except that Type-I in-band signal and out-of-band
#' signal are combined. This prevents color-channel switching due to SNPs.
#' 
#' @param sdf \code{SigDF}
#' @param sum.TypeI whether to sum type I channels
#' @param mask whether to use mask
#' @param collapseToPfx remove replicate to prefix (e.g., cg number) and
#' remove the suffix
#' @param collapseMethod mean or minPval
#' @return a numeric vector, beta values
#' @examples
#' sesameDataCache() # if not done yet
#' sdf <- sesameDataGet('EPIC.1.SigDF')
#' betas <- getBetas(sdf)
#' @export
getBetas <- function(
    sdf, mask=TRUE, sum.TypeI = FALSE, collapseToPfx = FALSE,
    collapseMethod = c("mean", "minPval")) {

    ## TODO: document collapseToPfx = T feature
    stopifnot(all(c("MG","UG","MR","UR") %in% colnames(sdf)))
    collapseMethod <- match.arg(collapseMethod)
    if (collapseToPfx && collapseMethod == "minPval") {
        sdf <- SDFcollapseToPfx(sdf)
    }

    if (sum.TypeI) {
        d1 <- InfI(sdf); d2 <- InfII(sdf)
        betas <- c(setNames(
            pmax(d1$MG+d1$MR,1)/pmax(d1$MG+d1$MR+d1$UG+d1$UR,2), d1$Probe_ID),
            setNames(pmax(d2$UG,1) / pmax(d2$UG+d2$UR,2), d2$Probe_ID))
    } else {
        dG <- InfIG(sdf); dR <- InfIR(sdf); d2 <- InfII(sdf)
        betas <- c(
            setNames(pmax(dG$MG,1) / pmax(dG$MG+dG$UG,2), dG$Probe_ID),
            setNames(pmax(dR$MR,1) / pmax(dR$MR+dR$UR,2), dR$Probe_ID),
            setNames(pmax(d2$UG,1) / pmax(d2$UG+d2$UR,2), d2$Probe_ID))
    }
    
    ## always use the original order
    betas <- setNames(betas[match(sdf$Probe_ID, names(betas))], sdf$Probe_ID)
    if (mask) { betas[sdf$mask] <- NA }

    if (collapseToPfx && collapseMethod == "mean") {
        betas <- betasCollapseToPfx(betas)
    }

    betas
}

#' Get allele frequency treating type I by summing alleles
#'
#' Takes a \code{SigDF} as input and returns a numeric vector containing
#' extra allele frequencies based on Color-Channel-Switching (CCS) probes.
#' If no CCS probes exist in the \code{SigDF}, then an numeric(0) is
#' returned.
#'
#' @param sdf \code{SigDF}
#' @param known.ccs.only consider only known CCS probes
#' @return beta values
#' @examples
#' sesameDataCache() # if not done yet
#' sdf <- sesameDataGet('EPIC.1.SigDF')
#' af <- getAFTypeIbySumAlleles(sdf)
#' @export
getAFTypeIbySumAlleles <- function(sdf, known.ccs.only = TRUE) {

    stopifnot(all(c("MG","UG","MR","UR") %in% colnames(sdf)))

    dG <- InfIG(sdf); dR <- InfIR(sdf)
    af <- c(setNames(
        pmax(dG$MR+dG$UR,1)/pmax(dG$MR+dG$UR+dG$MG+dG$UG,2), dG$Probe_ID),
        setNames(
            pmax(dR$MG+dR$UG,1)/pmax(dR$MR+dR$UR+dR$MG+dR$UG,2), dR$Probe_ID))

    if (known.ccs.only) {
        af <- af[intersect(
            names(af), sesameDataGet('ethnicity.inference')$ccs.probes)]
    }
    
    af[order(names(af))] # TODO: avoid sorting
}

#' Get allele frequency
#'
#' @param sdf \code{SigDF}
#' @param ... additional options to getBetas
#' @return allele frequency
#' @examples
#' sesameDataCache() # if not done yet
#' sdf <- sesameDataGet('EPIC.1.SigDF')
#' af <- getAFs(sdf)
#' @export
getAFs <- function(sdf, ...) {
    betas <- getBetas(sdf, ...)
    c(betas[startsWith(names(betas), "rs")], getAFTypeIbySumAlleles(sdf))
}

## return NULL if failed
inferPlatformFromTango <- function(res) {
    sig <- sesameDataGet('idatSignature')
    cnts <- vapply(
        sig, function(x) sum(x %in% rownames(res$Quants)), integer(1))
    if (max(cnts) < 0.8*min(vapply(sig, length, numeric(1))) || 
        sum(cnts == max(cnts)) > 1) {
        return(NULL)
    }
    names(which.max(cnts))
}

## Import one IDAT file
## return a data frame with 2 columns, corresponding to
## cy3 (Grn) and cy5 (Red) color channel signal
readIDAT1 <- function(grn.name, red.name) {
    ida.grn <- suppressWarnings(readIDAT(grn.name))
    ida.red <- suppressWarnings(readIDAT(red.name))
    d <- cbind(
        cy3 = ida.grn$Quants[,"Mean"],
        cy5 = ida.red$Quants[,"Mean"],
        cy3n = ida.grn$Quants[,"NBeads"],
        cy5n = ida.red$Quants[,"NBeads"])
    colnames(d) <- c('G', 'R', "GN", "RN")

    ## this is not always accurate
    ## TODO should identify unique tango IDs.
    attr(d, 'platform') <- inferPlatformFromTango(ida.red)

    d
}

#' Import a pair of IDATs from one sample
#'
#' The function takes a prefix string that are shared with _Grn.idat
#' and _Red.idat. The function returns a \code{SigDF}.
#'
#' @param prefix.path sample prefix without _Grn.idat and _Red.idat
#' @param manifest optional design manifest file
#' @param min_beads minimum bead number, probes with R or G smaller than
#' this threshold will be masked. If NULL, no filtering based on bead
#' count will be applied.
#' @param controls optional control probe manifest file
#' @param verbose be verbose?  (FALSE)
#' @param platform EPIC, HM450 and HM27 etc.
#' 
#' @return a \code{SigDF}
#' 
#' @examples
#' sdf <- readIDATpair(sub('_Grn.idat','',system.file(
#'     "extdata", "4207113116_A_Grn.idat", package = "sesameData")))
#' @export
readIDATpair <- function(
    prefix.path, manifest = NULL, platform = '', min_beads = NULL,
    controls = NULL, verbose = FALSE) {
    
    if (file.exists(paste0(prefix.path, '_Grn.idat'))) {
        grn.name <- paste0(prefix.path, '_Grn.idat')
    } else if (file.exists(paste0(prefix.path, '_Grn.idat.gz'))) {
        grn.name <- paste0(prefix.path, '_Grn.idat.gz')
    } else {
        stop('Grn IDAT does not exist')
    }
    
    if (file.exists(paste0(prefix.path, '_Red.idat'))) {
        red.name <- paste0(prefix.path, '_Red.idat')
    } else if (file.exists(paste0(prefix.path, '_Red.idat.gz'))) {
        red.name <- paste0(prefix.path, '_Red.idat.gz')
    } else {
        stop('Red IDAT does not exist')
    }

    if (verbose == TRUE) {
        message("Reading IDATs for ", basename(prefix.path), "...")
    }

    dm <- readIDAT1(grn.name, red.name)
    if (platform != "") { # override inferred platform
        attr(dm, "platform") <- platform
    } else if (is.null(attr(dm, "platform"))) { # cannot infer
        if (!is.null(manifest)) {               # manifest is provided
            attr(dm, "platform") <- "custom"
        } else { # no manifest provided
            stop("Cannot infer platform. Please provide custom manifest.")
        }
    }
    
    if (is.null(manifest)) { # pre-built platforms, EPIC, HM450, HM27 etc
        df_address <- sesameDataGet(paste0(
            attr(dm, 'platform'), '.address'))
        manifest <- df_address$ordering
        controls <- df_address$controls
    }

    sdf <- sdfMsg(chipAddressToSignal(dm, manifest, min_beads), verbose,
        "IDAT platform: %s", attr(dm, "platform"))
    ## Probe IDs might not fully resolve platform
    attr(sdf, "platform") <- attr(dm, "platform")
    if (!is.null(controls) && nrow(controls) > 0) {
        attr(sdf, "controls") <- readControls(dm, controls)
    }
    sdf
}

readControls <- function(dm, controls) {
    if ("Color_Channel" %in% colnames(controls)) { # legacy control data
        ctl <- as.data.frame(dm[match(controls$Address, rownames(dm)),])
        rownames(ctl) <- make.names(controls$Name, unique=TRUE)
        ctl <- cbind(ctl, controls[, c("Color_Channel","Type")])
        colnames(ctl) <- c('G','R','col','type')
        ctl <- ctl[!(is.na(ctl$G)|is.na(ctl$R)),] # no NA in controls
    } else {
        ctl <- as.data.frame(chipAddressToSignal(dm, controls))
    }
    ctl
}

#' Identify IDATs from a directory
#'
#' The input is the directory name as a string. The function identifies all
#' the IDAT files under the directory. The function returns a vector of such
#' IDAT prefixes under the directory.
#'
#' @param dir.name the directory containing the IDAT files.
#' @param recursive search IDAT files recursively
#' @param use.basename basename of each IDAT path is used as sample name
#' This won't work in rare situation where there are duplicate IDAT files.
#' @return the IDAT prefixes (a vector of character strings).
#'
#' @examples
#' ## only search what are directly under
#' IDATprefixes <- searchIDATprefixes(
#'     system.file("extdata", "", package = "sesameData"))
#'
#' ## search files recursively is by default
#' IDATprefixes <- searchIDATprefixes(
#'     system.file(package = "sesameData"), recursive=TRUE)
#' @export
searchIDATprefixes <- function(dir.name,
    recursive = TRUE, use.basename = TRUE) {

    stopifnot(dir.exists(dir.name))

    paths <- list.files(dir.name, '\\.idat(.gz)?$', recursive = recursive)
    prefixes <- unique(sub('_(Grn|Red).idat(.gz)?', '', paths))

    df <- data.frame(
        paths=paths, 
        prefix=sub('_(Grn|Red).idat.*', '', paths),
        channel=sub('.*_(Grn|Red).idat.*', '\\1', paths))
    
    byprefix <- split(df, df$prefix)
    ## valid IDAT should has both Grn and Red
    is.valid <- vapply(byprefix, function(x) all(
        sort(x[,'channel']) == c('Grn','Red')), logical(1))
    
    prefixes <- names(is.valid)[is.valid]
    if (length(prefixes) == 0)
        stop("No IDAT file found.")

    prefixes <- file.path(dir.name, prefixes)
    
    ## set name attributes so that names are auto-assigned for
    ## lapply and mclapply
    if (use.basename) {
        names(prefixes) <- basename(prefixes)
    } else {
        names(prefixes) <- prefixes
    }
    prefixes
}

#' Lookup address in one sample
#'
#' Lookup address and transform address to probe
#'
#' Translate data in chip address to probe address.
#' Type I probes can be separated into Red and Grn channels. The
#' methylated allele and unmethylated allele are at different
#' addresses. For type II probes methylation allele and unmethylated allele are
#' at the same address. Grn channel is for methylated allele and Red channel is
#' for unmethylated allele. The out-of-band signals are type I probes measured
#' using the other channel.
#'
#' @param dm data frame in chip address, 2 columns: cy3/Grn and cy5/Red
#' @param mft a data frame with columns Probe_ID, M, U and col
#' @param min_beads minimum bead counts, otherwise masked
#' @return a SigDF, indexed by probe ID address
chipAddressToSignal <- function(dm, mft, min_beads = NULL) {

    ## Infinium-I
    mft1 <- mft[!is.na(mft$col),]
    tmpM <- dm[match(mft1$M, rownames(dm)),]
    tmpU <- dm[match(mft1$U, rownames(dm)),]
    sdf <- data.frame(
        Probe_ID=mft1$Probe_ID,
        MG=unname(tmpM[,"G"]),
        MR=unname(tmpM[,"R"]),
        UG=unname(tmpU[,"G"]),
        UR=unname(tmpU[,"R"]),
        col=mft1$col, mask=FALSE)

    if (!is.null(min_beads)) {
        sdf$mask <- (
            is.na(tmpM[,"GN"]) | is.na(tmpM[,"RN"]) |
            is.na(tmpU[,"GN"]) | is.na(tmpU[,"RN"]) |
            tmpM[,"GN"] < min_beads | tmpM[,"RN"] < min_beads |
            tmpU[,"GN"] < min_beads | tmpU[,"RN"] < min_beads)
    }

    ## Infinium-II
    mft2 <- mft[is.na(mft$col),]
    if (nrow(mft2) > 0) {
        tmp <- dm[match(mft2$U, rownames(dm)),]
        s2 <- data.frame(
            Probe_ID=mft2$Probe_ID,
            MG=NA, MR=NA,
            UG=unname(tmp[,"G"]),
            UR=unname(tmp[,"R"]),
            col="2", mask=FALSE)

        if (!is.null(min_beads)) {
            s2$mask <- (
                is.na(tmp[,"GN"]) | is.na(tmp[,"RN"]) |
                tmp[,"GN"] < min_beads | tmp[,"RN"] < min_beads)
        }
        sdf <- rbind(sdf, s2)
    }
    sdf$col <- factor(sdf$col, levels=c("G","R","2"))
    sdf <- sdf[match(mft$Probe_ID, sdf$Probe_ID),] # always the mft order
    sdf <- structure(sdf, class=c("SigDF", "data.frame"))
    rownames(sdf) <- NULL
    sdf
}

SigSetToSigDF <- function(sset) {
    df <- rbind(
        data.frame(
            Probe_ID = rownames(sset@IG),
            MG = sset@IG[,"M"], MR = sset@oobR[,"M"],
            UG = sset@IG[,"U"], UR = sset@oobR[,"U"], col="G", mask=FALSE),
        data.frame(
            Probe_ID = rownames(sset@IR),
            MG = sset@oobG[,"M"], MR = sset@IR[,"M"],
            UG = sset@oobG[,"U"], UR = sset@IR[,"U"], col="R", mask=FALSE),
        data.frame(
            Probe_ID = rownames(sset@II),
            MG = NA, MR = NA,
            UG = sset@II[,"M"], UR = sset@II[,"U"], col="2", mask=FALSE))
    sdf <- structure(df, class=c("SigDF", "data.frame"))
    sdf$col <- factor(sdf$col, levels=c("G","R","2"))
    attr(sdf, "platform") <- sset@platform
    attr(sdf, "controls") <- sset@ctl
    rownames(sdf) <- NULL
    sdf
}

#' Compute internal bisulfite conversion control
#'
#' Compute GCT score for internal bisulfite conversion control. The function
#' takes a \code{SigSet} as input. The higher the GCT score, the more likely
#' the incomplete conversion.
#'
#' @param sdf a SigDF
#' @param extR a vector of probe IDs for Infinium-I probes that extend to
#' converted A
#' @param extA a vector of probe IDs for Infinium-I probes that extend to
#' original A
#' @param verbose print more messages
#' @return GCT score (the higher, the more incomplete conversion)
#' @examples
#' sesameDataCache() # if not done yet
#' sdf <- sesameDataGet('EPIC.1.SigDF')
#' bisConversionControl(sdf)
#'
#' ## For more recent platforms like EPICv2, MSA:
#' ## One need extR and extA of other arrays using the sesameAnno
#' \dontrun{
#' mft = sesameAnno_buildManifestGRanges(sprintf(
#'   "%s/EPICv2/EPICv2.hg38.manifest.tsv.gz",
#'   "https://github.com/zhou-lab/InfiniumAnnotationV1/raw/main/Anno/"),
#'   columns="nextBase")
#' extR = names(mft)[!is.na(mft$nextBase) & mft$nextBase=="R"]
#' extA = names(mft)[!is.na(mft$nextBase) & mft$nextBase=="A"]
#' }
#'
#' @export
bisConversionControl <- function(sdf, extR=NULL, extA=NULL, verbose = FALSE) {

    platform <- sdfPlatform(sdf, verbose = verbose)
    if (platform %in% c('EPICplus','EPIC','HM450')) {
        extR <- sesameDataGet(paste0(platform, '.probeInfo'))$typeI.extC
        extA <- sesameDataGet(paste0(platform, '.probeInfo'))$typeI.extT
    }
    stopifnot(!is.null(extR) && !is.null(extA))
    df <- InfIR(sdf)
    extR <- intersect(df$Probe_ID, extR)
    extA <- intersect(df$Probe_ID, extA)
    dR <- df[match(extR, df$Probe_ID),]
    dA <- df[match(extA, df$Probe_ID),]
    mean(c(dR$MG, dR$UG), na.rm=TRUE) / mean(c(dA$MG, dA$UG), na.rm=TRUE)
}

## retired functions:
## after 1.11
## bisConversionControl, SigSet class, updateSigSet, subsetSignal,
## makeExampleSeSAMeDataSet, makeExampleTinyEPICDataSet, saveMask, 
## restoreMask, buildControlMatrix450k, detectionPoobEcdf2,
## detectionPnegNormTotal, detectionPnegNormGS, detectionPfixedNorm,
## detectionPnegNorm, noobsb, IG2,IGR2,oobG2, oobR2ļ¼ŒSigSetMethod,
## SigSetList, SigSetsToRGChannelSet, RGChannelSetToSigSets,
## SigSetToRatioSet, parseGEOSignalABFile
## after 1.5.0
## R6 utility functions
zwdzwd/sesame documentation built on Jan. 8, 2025, 4:50 a.m.