R/QC.R
In DiscoRhythm: Interactive Workflow for Discovering Rhythmicity in Biological Data

Documented in discoInterCorOutliers discoPCA discoPCAgetOutliers discoPCAoutliers discoRepAnalysis

#' Quality Control for DiscoRhythm
#'
#' Functions for executing outlier detection and row filtering procedures
#' prior to rhythmicity analysis.
#'
#' @name discoQC
NULL

#' Perform PCA
#'
#' Calculates PCA results from \code{prcomp} with error handling and outputs
#' suitable for the DiscoRhythm workflow.
#'
#' @param npcs numeric, maximum number of principal components to return.
#'
#' @inheritParams discoPCAoutliers
#'
#' @return output from \link[stats]{prcomp} with an added table summary
#'
#' @export
#'
#' @examples
#' se <- discoGetSimu(TRUE)
#' pca <- discoPCA(se)
#'
discoPCA <- function(se, scale = TRUE, npcs = 10) {
    # Unit checks
    foo <- dim(se)
    if (foo[2] < 2) {
        stop("Not enough samples for PCA.")
    }
    if (foo[1] < 2) {
        stop("Not enough rows for PCA.")
    }
    if (!methods::is(se, "SummarizedExperiment")) {
        stop("Input must be a SummarizedExperiment.")
    }
    if (any(c(!is.numeric(npcs), length(npcs) != 1, npcs <= 0))) {
        stop("npcs should be a single positive numeric value.")
    }


    dat <- assay(se)
    
    res <- stats::prcomp(t(dat),
                center = TRUE,
                scale. = scale,
                rank. = npcs
                )

    npcs <- min(npcs, ncol(res$x))
    x <- t(summary(res)$importance[, seq_len(npcs)])
    x <- data.frame(PC = rownames(x), x)
    res$table <- x
    rownames(res$x) <- se$ID
    
    return(res)
}

#' Internal function for applying SD cutoff to PCA results
#' Returns a logical indicating which samples are not outliers
#' @keywords internal
#' @return logical indicating which samples are outliers in PCA
discoPCAgetOutliers <- function(x, SDfactor = 3, pcToCut = seq_len(4)) {
    sdVec <- matrixStats::rowSds(t(x[, pcToCut, drop = FALSE])) * SDfactor
    meanVec <- colMeans(x[, pcToCut, drop = FALSE])
    meanMat <- matrix(rep(meanVec, nrow(x)), nrow = nrow(x), byrow = TRUE)
    sdMat <- matrix(rep(sdVec, nrow(x)), nrow = nrow(x), byrow = TRUE)
    isoutlier <- apply(abs(x[, pcToCut,
                            drop = FALSE] - meanMat) > sdMat, 1, any)
    return(!isoutlier)
}

#' @rdname discoQC
#'
#' @inheritParams discoInterCorOutliers
#' @param scale logical, whether or not to scale the data prior to PCA, see
#' \link[stats]{prcomp} for more details.
#' @param pcToCut numeric, which PCs to use for outlier detection. It is
#' recomended to select the first X PCs based on which PCs explain a
#' significant amount of variance in the data.
#'
#' @return list containing PCA results and the detected outliers
#'
#' @export
#'
#' @examples
#' se <- discoGetSimu(TRUE)
#' PCAres <- discoPCAoutliers(se)
#'
discoPCAoutliers <- function(se, threshold = 3,
    scale = TRUE, pcToCut = seq_len(4)) {
    if(!methods::is(se,"SummarizedExperiment")){
        stop("Input must be a SummarizedExperiment.")
    }
    
    res <- discoPCA(se, scale = scale)
    res$outliers <- !discoPCAgetOutliers(res$x, threshold, pcToCut)
    res
}

#' @rdname discoQC
#'
#' @param se SummarizedExperiment, the main data object used by DiscoRhythm 
#' expected to contain se$ID, se$ReplicateID, se$Time sample metadata and
#' non-null rownames. See the vignette for more details.
#' @param cor_method character, method of pairwise correlation
#' (see \link[stats]{cor}'s "method" argument for all options).
#' @param threshold numeric, a threshold determining which samples are
#' outliers (for discoInterCorOutliers, in units of thresh_type, for
#' discoPCAoutliers in units of standard deviations).
#' @param thresh_type character indicating threshold type (either standard
#' deviations below the mean, or an absolution correlation value). One of:
#' "sd" or "value".
#'
#' @return A list of 3 objects:
#' 1) outliers - named logical indicating if the sample is an outlier
#' 2) meanCor - mean of all pairwise correlations for a given sample
#' 3) corMat - Matrix of all pairwise correlation values
#'
#' @export
#'
#' @examples
#' CorRes <- discoInterCorOutliers(se)
#'
discoInterCorOutliers <- function(se,
                                    cor_method = c("pearson",
                                                    "kendall",
                                                    "spearman"),
                                    threshold = 3,
                                    thresh_type = c("sd","value")) {

    cor_method = match.arg(cor_method)
    thresh_type = match.arg(thresh_type)
    
    if(!methods::is(se,"SummarizedExperiment")){
        stop("Input must be a SummarizedExperiment.")
    }
    
    dat <- assay(se)

    mat <- stats::cor(dat, method = cor_method)

    # Remove Diagonal (all 1's) to avoid inflated correlation
    corMat <- mat
    diag(corMat) <- NA

    meanCor <- rowMeans(corMat, na.rm = TRUE)

    if (thresh_type == "sd") {
        cutval <- (mean(meanCor) - threshold * stats::sd(meanCor))
    } else if (thresh_type == "value") {
        cutval <- threshold
    }
    outliers <- meanCor <= cutval

    res <- list(outliers = outliers,
                meanCor = meanCor,
                corMat = corMat, 
                threshold = cutval)
    res
}

################## Row Replicate Analysis #######################

# Average technical replicates
#' @keywords internal
averageTech <- function(se, method = c("Median","Mean","Random","None")) {
    
    method <- match.arg(method)
    
    if(!methods::is(se,"SummarizedExperiment")){
        stop("Input must be a SummarizedExperiment.")
    }
    
    bioID <- paste0(se$Time,"_",se$ReplicateID)
    mat <- assay(se)
    
    # Data matrix
    res <- lapply(unique(bioID), function(I) {
        sidx <- which(bioID == I)
        if (length(sidx) >= 2) {
        if (method == "Median") {
            tmp <- as.data.frame(matrixStats::rowMedians(mat[, sidx]))
        } else if (method == "Mean") {
            tmp <- as.data.frame(matrixStats::rowMeans2(mat[, sidx]))
            } else if (method == "Random") {
                tmp <- as.data.frame((mat[, sample(sidx, 1)]))
            } else if (method == "None") {
                tmp <- as.data.frame(mat[, sidx])
            }
        } else {
            tmp <- as.data.frame(mat[, sidx])
        }
        
        # Following the original input data naming scheme:
        # <prefix><time>_<unique_id>_<replicate_id>
        # so output can be used as shiny input
        if (method != "None") {
            colnames(tmp) <- paste(se$Time[sidx - 1], "A",
                se$ReplicateID[sidx - 1], sep = "_")[1]
        } else {
            colnames(tmp) <- paste(se$Time[sidx - 1], "A",
                se$ReplicateID[sidx - 1], sep = "_")
        }
        tmp
    })

    res2 <- do.call(data.frame, res)
    regDat <- data.frame("ID" = as.character(rownames(se)), res2)

    # Metadata
    res <- lapply(unique(bioID), function(I) {
        sidx <- which(bioID == I)
        if (length(sidx) >= 2) {
            if (method != "None") {
                tmp <- colData(se)[sidx, ][1, ]
            } else {
                tmp <- colData(se)[sidx, ]
            }
        } else {
            tmp <- colData(se)[sidx, ]
        }
        tmp
    })

    res2 <- do.call(rbind, res)
    regMet <- res2[, c("Time","ReplicateID")]

    outse <- discoDFtoSE(regDat,regMet)
    
    return(list("se" = outse))
}

#' @rdname discoQC
#'
#' @inheritParams discoBatch
#' @inheritParams discoInterCorOutliers
#'
#' @export
#'
#' @examples
#' ANOVAres <- discoRepAnalysis(se)
#'
discoRepAnalysis <- function(se,
                                aov_method = c("Equal Variance","Welch","None"),
                                aov_pcut = 0.05,
                                aov_Fcut = 0,
                                avg_method = c("Median","Mean","Random","None")
) {
    
    aov_method=match.arg(aov_method)
    avg_method=match.arg(avg_method)
    
    if(!methods::is(se,"SummarizedExperiment")){
        stop("Input must be a SummarizedExperiment.")
    }
    
    Metadata <- colData(se)
    mat <- assay(se)
    
    bioID <- paste0(se$Time,"_",se$ReplicateID)

    noAOV <- !(aov_method %in% c("Welch","Equal Variance"))
    norep <- !any(duplicated(bioID)) & !noAOV
    if(norep) warning("No replicates present for discoRepAnalysis,
        can't perform ANOVA")
    # Return the same for aov_method="None" and for
    # the case of no replicates
    if(norep | noAOV){
        nr <- nrow(se)
        allStats <- list(statistic = rep(100, nr),
            pvalue = rep(0, nrow(se)),
            df.between = rep(0, nr),
            df.within = rep(0, nr),
            estimate = rep(0, nr))
    } else if (aov_method == "Welch") {
        allStats <- matrixTests::row_oneway_welch(mat, bioID)
    } else if (aov_method == "Equal Variance") {
        allStats <- matrixTests::row_oneway_equalvar(mat, bioID)
    }
    idx <- (allStats$pvalue <= aov_pcut) & (allStats$statistic > aov_Fcut)
    res <- averageTech(se[idx, ], avg_method)
    res$aovP <- allStats$pvalue
    res$allStats <- allStats
    
    return(res)
}

Any scripts or data that you put into this service are public.

DiscoRhythm documentation built on Nov. 8, 2020, 7:32 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

DiscoRhythm
Interactive Workflow for Discovering Rhythmicity in Biological Data

R/QC.R
In DiscoRhythm: Interactive Workflow for Discovering Rhythmicity in Biological Data

Defines functions discoRepAnalysis averageTech discoInterCorOutliers discoPCAoutliers discoPCAgetOutliers discoPCA

Documented in discoInterCorOutliers discoPCA discoPCAgetOutliers discoPCAoutliers discoRepAnalysis

Try the DiscoRhythm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DiscoRhythm Interactive Workflow for Discovering Rhythmicity in Biological Data

R/QC.R In DiscoRhythm: Interactive Workflow for Discovering Rhythmicity in Biological Data

Defines functions discoRepAnalysis averageTech discoInterCorOutliers discoPCAoutliers discoPCAgetOutliers discoPCA

Documented in discoInterCorOutliers discoPCA discoPCAgetOutliers discoPCAoutliers discoRepAnalysis

Try the DiscoRhythm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DiscoRhythm
Interactive Workflow for Discovering Rhythmicity in Biological Data

R/QC.R
In DiscoRhythm: Interactive Workflow for Discovering Rhythmicity in Biological Data