R/PCAMax.R
In trena: Fit transcriptional regulatory networks using gene expression, priors, machine learning

Documented in PCAMax

#' Class PCAMax
#'
#' @import methods
#' @import utils
#'
#' @name PCAMax
#'
.pcaMax <- setClass("PCAMax",
                    slots=c(tbl="data.frame",
                            solverNames="character",
                            state="environment")
                    )

#------------------------------------------------------------------------------------------------------------------------
setGeneric("normalizeModel",   signature="obj", function(obj, normalizing.max=10)
            standardGeneric ("normalizeModel"))
setGeneric("addStats",         signature="obj", function(obj, varianceToInclude=0.75, scalePCA=FALSE) standardGeneric ("addStats"))
setGeneric("addStatsSimple",   signature="obj", function(obj, varianceToInclude=0.75, scalePCA=FALSE, excludeLasso=TRUE, quiet=TRUE) standardGeneric ("addStatsSimple"))
setGeneric("getCoverage",      signature="obj", function(obj) standardGeneric ("getCoverage"))
#------------------------------------------------------------------------------------------------------------------------
#' Create a PCAMax object from a data.frame produced by the EnsembleSolver
#'
#' @param tbl a data.frame
#' @param tfIdentifierColumnName a character string identifying the transcription factor column
#'
#' @return a PCAMax object
#'
#' @seealso  \code{\link{EnsembleSolver}}
#'
#' @export
#'
PCAMax <- function(tbl, tfIdentifierColumnName="tf.hgnc")
{
    stopifnot(tfIdentifierColumnName %in% colnames(tbl))
    standardSolverNames <- c("betaLasso", "lassoPValue", "pearsonCoeff", "rfScore",
                             "betaRidge", "spearmanCoeff")

    coi <- intersect(standardSolverNames, colnames(tbl))
    if(length(coi) < 3)
      stop(sprintf("insufficient dimensions,  %d in EnsembleSolver data.frame to calculate pcaMax", length(coi)))

    tbl.trimmed <- tbl[, coi]
    rownames(tbl.trimmed) <- tbl[, tfIdentifierColumnName]
    state <- new.env(parent=emptyenv())
    state$coverage <- 0
    state$tbl.orig <- tbl
    .pcaMax(tbl=tbl.trimmed, solverNames=standardSolverNames, state=state)

} # PCAMax ctor
#------------------------------------------------------------------------------------------------------------------------
#' transform a specific column to fit normal distribution
#'
#' @rdname normalizeModel
#' @aliases normalizeModel
#'
#' @param obj An object of the class PCAMax
#' @param normalizing.max numeric, a maximum value for the normalized distrubtions
#'
#' @return a normalized matrix, each column treated separately
#'
#' @export
#'
setMethod('normalizeModel', 'PCAMax',

     function(obj, normalizing.max=10){

        mtx <- as.matrix(obj@tbl)
        for(colname in colnames(mtx)){
           vec <- as.numeric(mtx[, colname])
           if(colname %in% c("betaLasso", "betaRidge"))
              vec.norm <- .normalize_betaValues(vec, normalizing.max)
           if(colname == "lassoPValue")
              vec.norm <- .normalize_pval(vec, normalizing.max)
           if(colname == "rfScore")
              vec.norm <- .normalize_randomForest(vec, normalizing.max)
           if(colname %in% c("spearmanCoeff", "pearsonCoeff"))
              vec.norm <- .normalize_correlationValues(vec, normalizing.max)
           mtx[, colname] <- vec.norm
           } # for colname
        obj@state$normalizedMatrix <- mtx
        mtx
        })

#------------------------------------------------------------------------------------------------------------------------
#' add PCA-based summary stats on all TFs in the model
#'
#' @rdname addStats
#' @aliases addStats
#'
#' @param obj An object of the class PCAMax
#' @param varianceToInclude numeric variance to include in the PCA
#' @param scalePCA logical
#'
#' @return  the original model with extra columns: pcaMax, cov, PC1, PC2,
#'
#' @export
#'
setMethod('addStats', 'PCAMax',

     function(obj, varianceToInclude=0.75, scalePCA=FALSE){
        stopifnot(varianceToInclude > 0)
        mtx <- obj@state$normalizedMatrix
           # try removing lassoPValue
        col.rm <- grep("lassoPValue", colnames(mtx))
        if(length(col.rm) > 0)
           mtx <- mtx[, -col.rm]
        pca <- prcomp(mtx, scale=scalePCA)
            #
        if(varianceToInclude > 0.95)
           numberOfComponents <- length(pca$sdev)
        numberOfComponents <- which(cumsum(pca$sdev/sum(pca$sdev) ) > varianceToInclude)[1]
        obj@state$pca <- pca
        mtx.pca <- pca$x
         # coverage: the variance covered by the first two components
        obj@state$coverage <- sum(pca$sdev[1:2]) / sum(pca$sdev)
        mean <- apply(mtx, 1, function(row) mean(abs(row)))
        mtx.2 <- cbind(mtx, mean)
        rms <- apply(mtx, 1, function(row) sqrt(mean(row * row)))
        calc.pc.vector.length <- function(row, numberOfComponents){
           sum.of.squares <- 0
           for(i in seq_len(numberOfComponents)){
              new.sum <- row[i] * row[i]
              sum.of.squares <- sum.of.squares + new.sum
              }
           sqrt(sum.of.squares/numberOfComponents)
           }
        pcaMax <- as.numeric(apply(mtx.pca, 1, function(row) calc.pc.vector.length(row, numberOfComponents)))
        #mtx.3 <- cbind(mtx.2, rms)
        #mtx.4 <- cbind(mtx.3, mtx.pca[, 1:2])
        #mtx.5 <- cbind(mtx.4, pcaMax)
          #  use coefficient of variation to desribe the dispersion (agreement) of the data
        covar <- apply(mtx, 1, function(row) sd(abs(row))/mean(abs(row)))
        #mtx.6 <- cbind(mtx.5, covar)
        #mtx.7 <- mtx.6[order(mtx.6[, "pcaMax"], decreasing=TRUE),]
        #obj@state$normalizedMatrixWithStats <- mtx.7
        tbl.out <- obj@state$tbl.orig
        existing.pcaMax.column <- grep("pcaMax", colnames(tbl.out))
        if(length(existing.pcaMax.column) > 0)
           tbl.out <- tbl.out[, -existing.pcaMax.column]
        tbl.out <- cbind(tbl.out, pcaMax)
        tbl.out$covar <- covar
        tbl.out <- tbl.out[order(tbl.out$pcaMax, decreasing=TRUE),]
        tbl.out$rank <- seq_len(nrow(tbl.out))

        tbl.out
        })

#------------------------------------------------------------------------------------------------------------------------
#' add PCA-based summary stats on all TFs in the model
#'
#' @rdname addStatsSimple
#' @aliases addStatsSimple
#'
#' @param obj An object of the class PCAMax
#' @param varianceToInclude numeric variance to include in the PCA
#' @param scalePCA logical
#' @param excludeLasso logical excluding lasso avoids dropping TFs due to shrinkage
#' @param quiet logical
#'
#' @return  the original model with extra columns: pcaMax, cov, PC1, PC2,
#'
#' @export
#'
setMethod('addStatsSimple', 'PCAMax',

     function(obj, varianceToInclude=0.75, scalePCA=FALSE, excludeLasso=TRUE, quiet=TRUE){
        stopifnot(varianceToInclude > 0)
        mtx <- obj@state$normalizedMatrix
           # try removing lassoPValue
        if(excludeLasso){
           col.rm <- grep("lasso", colnames(mtx), ignore.case=TRUE)
           printf("exluding %d lasso-related columns", length(col.rm))
           if(length(col.rm) > 0)
              mtx <- mtx[, -col.rm]
           } # if excludeLasso
        scoreRow <- function(row){
           zeros <- which(abs(row) < 1e-10)
           if(length(zeros) > 0){
              #printf("reducing row by %d", length(zeros))
              row <- row[-zeros]
              }
           sum(abs(row))/length(row)
           }
        score <- round(apply(mtx, 1, scoreRow), digits=2)
        if(!quiet){
           print(mtx)
           print(score)
           }

        score.norm <- round(100 * (score/max(score)))
        tbl.out <- cbind(obj@state$tbl.orig, score.norm)
        tbl.out <- cbind(tbl.out, score)
        tbl.out <- tbl.out[order(tbl.out$score, decreasing=TRUE),]
        tbl.out$rank <- seq_len(nrow(tbl.out))
        tbl.out
        })

#------------------------------------------------------------------------------------------------------------------------
#' what percentage of the variance is captured in the first two principal components?
#'
#' @rdname getCoverage
#' @aliases getCoverage
#'
#' @param obj An object of the class PCAMax
#'
#' @return a number between zero and one
#'
#' @export
#'
setMethod('getCoverage', 'PCAMax',

     function(obj){
        obj@state$coverage
     })

#------------------------------------------------------------------------------------------------------------------------
.normalize_randomForest <- function(vector, normalizing.max=10)
{
   third.quartile.normalizing.max <- 0.75 * normalizing.max
   third.quartile <- fivenum(vector)[4]
   actual.max <- third.quartile
   scale <- third.quartile.normalizing.max/actual.max
   vector * scale

} # .normalize_randomForest
#------------------------------------------------------------------------------------------------------------------------
.normalize_pval <- function(vector, normalizing.max=10)
{
   stopifnot(all(vector >= 0))
   # x <- -log10(vector + temper.pvalue.amount) #   ensure no zero values, suppress large numbers
   x <- -log10(vector +  .Machine$double.xmin)  #   ensure no zero values
   actual.max <- max(x)
   scale <- normalizing.max/actual.max
   x * scale

} # .normalize_pval
#------------------------------------------------------------------------------------------------------------------------
.normalize_betaValues <- function(vector, normalizing.max=10)
{
   actual.max <- max(abs(vector))
   scale <- normalizing.max/actual.max
   vec.new <- vector * scale
   vec.new

} # .normalize_betaValues
#------------------------------------------------------------------------------------------------------------------------
.normalize_correlationValues <- function(vector, normalizing.max=10)
{
   actual.max <- 1
   scale <- normalizing.max/actual.max
   vector * scale

} # .normalize_correlationValues
#------------------------------------------------------------------------------------------------------------------------
#              # Replace missing values and scale the data
#              # Use the *.med and *.scale values to center/scale everything
#              tbl.all[is.na(tbl.all)] <- 0
#              tbl.scale <- tbl.all[-1]
#
#              if(
# "lassoPValue" %in% names(tbl.scale)){
#                  tbl.scale$lassoPValue <- -log10(tbl.scale$lassoPValue)
#                  tbl.scale$lassoPValue <- scale(tbl.scale$lassoPValue,
#                                                   center = lassopv.med,
#                                                   scale = lassopv.scale)
#              }
#
#              if("betaLasso" %in% names(tbl.scale)){
#                  tbl.scale$betaLasso <- scale(tbl.scale$betaLasso,
#                                                center = lasso.med,
#                                                scale = lasso.scale)
#              }
#
#              if("betaRidge" %in% names(tbl.scale)){
#                  tbl.scale$betaRidge <- scale(tbl.scale$betaRidge,
#                                                center = ridge.med,
#                                                scale = ridge.scale)
#              }
#
#              if("pearsonCoeff" %in% names(tbl.scale)){
#                  tbl.scale$pearsonCoeff <- scale(tbl.scale$pearsonCoeff,
#                                                   center = pearson.med,
#                                                   scale = pearson.scale)
#              }
#
#              if("spearmanCoeff" %in% names(tbl.scale)){
#                  tbl.scale$spearmanCoeff <- scale(tbl.scale$spearmanCoeff,
#                                                    center = spearman.med,
#                                                    scale = spearman.scale)
#              }
#
#
#              if("bayesScore" %in% names(tbl.scale)){
#                  tbl.scale$bayesScore <- scale(tbl.scale$bayesScore,
#                                             center = bayesspike.med,
#                                             scale = bayesspike.scale)
#              }
#
#              if("rfScore" %in% names(tbl.scale)){
#                  tbl.scale$rfScore <- scale(tbl.scale$rfScore,
#                                              center = randomforest.med,
#                                              scale = randomforest.scale)
#              }
#
#              rownames(tbl.scale) <- tbl.all$gene
#
#              tbl.augmented <- try(.addEnsembleScores(tbl.scale, tbl.all), silent = TRUE)
#
##------------------------------------------------------------------------------------------------------------------------
#.addEnsembleScores <- function(tbl.scale, tbl.all) {
#
#    # Compute the scaled "concordance score"
#    pca <- stats::prcomp(tbl.scale, center=FALSE, scale.=FALSE)
#
#    pca$x <- pca$x / sqrt(length(which(pca$sdev > 0.1)))
#    concordance <- apply(pca$x[, pca$sdev > 0.1, drop=FALSE], 1,
#                         function(x) {sqrt(mean((2*atan(x)/pi)^2))})
#    concordance <- as.data.frame(concordance)
#    concordance$gene <- rownames(concordance)
#    rownames(concordance) <- NULL
#    tbl.all <- merge(tbl.all, concordance, by = "gene", all = TRUE)
#
#    # Transform via PCA and compute the pcaMax score
#    pcaMax <- apply(pca$x[, pca$sdev > 0.1, drop=FALSE],1, function(x) {sqrt(mean(x*x))})
#    pcaMax <- as.data.frame(pcaMax)
#    pcaMax$gene <- rownames(pcaMax)
#    rownames(pcaMax) <- NULL
#    tbl.all <- merge(tbl.all, pcaMax, by = "gene", all = TRUE)
#
#    # Sort by pcaMax
#    tbl.all <- tbl.all[order(tbl.all$pcaMax, decreasing = TRUE),]
#
#    return(tbl.all)
#}
##------------------------------------------------------------------------------------------------------------------------
##----------------------------------------------------------------------------------------------------

Any scripts or data that you put into this service are public.

trena documentation built on Nov. 15, 2020, 2:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

trena
Fit transcriptional regulatory networks using gene expression, priors, machine learning

R/PCAMax.R
In trena: Fit transcriptional regulatory networks using gene expression, priors, machine learning

Defines functions .normalize_correlationValues .normalize_betaValues .normalize_pval .normalize_randomForest PCAMax

Documented in PCAMax

Try the trena package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

trena Fit transcriptional regulatory networks using gene expression, priors, machine learning

R/PCAMax.R In trena: Fit transcriptional regulatory networks using gene expression, priors, machine learning

Defines functions .normalize_correlationValues .normalize_betaValues .normalize_pval .normalize_randomForest PCAMax

Documented in PCAMax

Try the trena package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

trena
Fit transcriptional regulatory networks using gene expression, priors, machine learning

R/PCAMax.R
In trena: Fit transcriptional regulatory networks using gene expression, priors, machine learning