R/pm_getSignature.R
In HiLDA: Conducting statistical inference on comparing the mutational exposures of mutational signatures by using hierarchical latent Dirichlet allocation

Documented in calcPMSLikelihood mySquareEM pmgetSignature PMSboundary updatePMSParam

#' Obtain the parameters for mutation signatures and memberships
#' 
#' @param mutationFeatureData the mutation data (MutationFeatureData class 
#'        (S4 class)) by the \code{hildaReadMPFile}. 
#' @param K the number of mutation signatures
#' @param numInit the number of performing calculations with different initial 
#'        values
#' @param tol tolerance for the estimation
#'        (when the difference of log-likelihoods become below this value, stop 
#'        the estimation)
#' @param maxIter the maximum number of iteration of estimation
#' 
#' @return The output is an instance of EstimatedParameters S4 class, which 
#'         stores estimated parameters and other meta-information, and will be 
#'         used for saving parameter values and visualizing the mutation 
#'         signatures and memberships
#' 
#' @examples 
#' ## After obtaining G (see e.g., hildaReadMPFile function)
#' load(system.file("extdata/sample.rdata", package="HiLDA"))
#' Param <- pmgetSignature(G, K = 3)
#' 
#'
#' @useDynLib HiLDA
#' @importFrom Rcpp sourceCpp
#' @importFrom stats rgamma
#' @importFrom methods slot is
#' @export
pmgetSignature <- function(mutationFeatureData, K, numInit = 10, tol = 1e-4, 
                           maxIter = 10000) {
  
    if (is(mutationFeatureData)[1] != "MutationFeatureData") {
      stop("This inputfile is not an MutationFeatureData object.")
    }
  
    if (K <= 1) {
      stop("Please enter an integer number greater than 1.")
    }
  
    isBG <- FALSE
    varK <- K
    BG <- 0

    sampleNum <- length(slot(mutationFeatureData, "sampleList"))
    fdim <- slot(mutationFeatureData, "possibleFeatures")
  
    tempL <- -Inf
    tempPar <- c()
    for (kkk in seq_len(numInit)) {
      
      F <- array(0, c(varK, length(fdim), max(fdim)))
      
      for (k in seq_len(varK)) {
        for (kk in seq_len(length(fdim))) {
          F[k,kk,seq_len(fdim[kk])] <- stats::rgamma(fdim[kk], rep(1, fdim[kk]))
          F[k,kk,seq_len(fdim[kk])] <- 
            F[k,kk,seq_len(fdim[kk])] / sum(F[k,kk,seq_len(fdim[kk])])
        }
      }
      
  
      Q <- matrix(rgamma(sampleNum * K, 1, 1), K, sampleNum)
      Q <- sweep(Q, 2, apply(Q, 2, sum), `/`)
      
      p0 <- c(convertToTurbo_F(as.vector(F), fdim, K, isBG), 
              convertToTurbo_Q(as.vector(t(Q)), K, sampleNum))
      Y <- list(list(sampleNum, fdim, slot(mutationFeatureData, 
                                           "featureVectorList"), 
                     slot(mutationFeatureData, "countData")), K, isBG, BG)
      
      res1 <- mySquareEM(p0, Y, tol = tol, maxIter = maxIter)
      cat(paste("#trial: ", sprintf("%2d", kkk), 
                "; #iteration: ", sprintf("%4d", as.integer(res1$itr)), 
                "; time(s): ", sprintf("%4.2f", res1$elapsed.time), 
                "; convergence: ", res1$convergence,
                "; loglikelihood: ", 
                sprintf("%.4f", res1$value.objfn), "\n", sep=""
      ))
      
      if (res1$value.objfn > tempL) {
        tempL <- res1$value.objfn
        tempPar <- res1$par
      }
      
    }
    
    lenF <- varK * (sum(fdim) - length(fdim))
    lenQ <- sampleNum * (K - 1)
    F <- convertFromTurbo_F(tempPar[seq_len(lenF)], fdim, K, isBG)
    Q <- convertFromTurbo_Q(tempPar[(lenF + 1):(lenF + lenQ)], K, sampleNum)
    dim(F) <- c(varK, length(fdim), max(fdim))
    dim(Q) <- c(sampleNum, K)
    
    # return(list(F, Q, tempL))
    return(new(Class = "EstimatedParameters", 
               type = slot(mutationFeatureData, "type"),
               flankingBasesNum = slot(mutationFeatureData, "flankingBasesNum"),
               transcriptionDirection = slot(mutationFeatureData, 
                                             "transcriptionDirection"),
               possibleFeatures = slot(mutationFeatureData, "possibleFeatures"),
               sampleList = slot(mutationFeatureData, "sampleList"),
               signatureNum = as.integer(K),
               isBackGround = isBG,
               backGroundProb = BG,
               signatureFeatureDistribution = F,
               sampleSignatureDistribution = Q,
               loglikelihood = tempL)
    )
 
}




#' A function for estimating parameters using Squared EM algorithm
#' 
#' @param p this variable includes the parameters for mutation signatures and 
#'          membership parameters
#' @param y this variable includes the information on the mutation features, 
#' the number of mutation signatures specified and so on
#' @param tol tolerance for the estimation
#'            (when the difference of log-likelihoods become below this value, 
#'            stop the estimation)
#' @param maxIter the maximum number of iteration of estimation
#' @return a list 
mySquareEM <- function(p, y, tol = 1e-4, maxIter = 10000) {
  
    prevL <- -Inf
    step.min <- 1
    step.max <- 1
    step.max0 <- 1
    mstep <- 4
    objfn.inc <- 1
    updEvalNum <- 0
    LEvalNum <- 0
    useSquareEM <- 0
    iterNum <- 0
    convFlag <- FALSE
    startTime <- proc.time()
    
    newL <- calcPMSLikelihood(p, y)
    LEvalNum <- LEvalNum + 1
    
    for (iterNum in seq_len(maxIter)) {
      
      p1 <- updatePMSParam(p, y)
      updEvalNum <- updEvalNum + 1
      if ( any(is.nan(unlist(p1))) ) {
        stop("Error in function evaluation")
      }
      
      q1 <- p1 - p
      sr2 <- crossprod(q1)
      
      p2 <- updatePMSParam(p1, y)
      updEvalNum <- updEvalNum + 1
      if ( any(is.nan(unlist(p2))) ) {
        stop("Error in function evaluation")
      }
      
      q2 <- p2 - p1
      sq2 <- sqrt(crossprod(q2))
      sv2 <- crossprod(q2 - q1)
      srv <- crossprod(q1, q2 - q1)
      
      # alpha <- switch(ctrl$version, -srv/sv2, -sr2/srv, sqrt(sr2/sv2))
      alpha <- -srv/sv2
      alpha <- max(step.min, min(step.max, alpha))
      p.new <- p + 2 * alpha * q1 + alpha^2 * (q2 - q1)
      
      # This step is done in the original turboEM code...
      # but I cannot understand why this step is necessary....
      if (isTRUE(abs(alpha - 1) > 0.01) ) {
        p.new <- updatePMSParam(p.new, y)
        updEvalNum <- updEvalNum + 1
      }
      
      # when p.new has some problems...
      if (any(is.nan(p.new)) | !PMSboundary(y)(p.new) ) {
        
        p.new <- p2
        newL <- calcPMSLikelihood(p2, y)
        LEvalNum  <- LEvalNum + 1
        
        # since there was a problem, consider to reduce the amount of step max
        if (isTRUE(all.equal(alpha, step.max))) {
          step.max <- max(step.max0, step.max / mstep)
        }
        alpha <- 1
        
        # when p.new is O.K....
      } else {
        
        newL <- calcPMSLikelihood(p.new, y)
        LEvalNum  <- LEvalNum + 1
        
        # when the calculated log-likelihood has some problems
        # or the difference betwen the calculated log-likelihood is large...
        if (is.nan(newL) | (newL > prevL + objfn.inc)) {
          
          p.new <- p2
          lnew <- calcPMSLikelihood(p2, y)
          LEvalNum  <- LEvalNum + 1
          
          # since there was a problem, consider to reduce the amount of step max
          if (alpha == step.max) {
            step.max <- max(step.max0, step.max / mstep)
          }
          alpha <- 1
          
        } else {
          useSquareEM <- useSquareEM + 1
        }
        
      }
      
      if (isTRUE(all.equal(alpha, step.max))) {
        step.max <- mstep * step.max
      }
      
      if (step.min < 0 & isTRUE(all.equal(alpha, step.min))) {
        step.min <- mstep * step.min
      }
      
      p <- p.new
      
      # for debugging
      # print(c(updEvalNum, LEvalNum, useSquareEM, step.min, step.max, newL))
      
      if (abs(prevL - newL) < tol) {
        convFlag <- TRUE
        break
      }
      
      if (!is.nan(newL)) {
        prevL <- newL
      }
      
    }
    
    calcTime <- proc.time() - startTime
    
    return(list(par = p,
                value.objfn = newL,
                itr = iterNum,
                fpeval = updEvalNum,
                convergence = convFlag,
                elapsed.time = calcTime[3]))
    
}



#' A function for updating parameters using EM-algorithm
#' 
#' @param p this variable includes the parameters for mutation signatures and 
#'          membership parameters
#' @param y this variable includes the information on the mutation features, 
#'          the number of mutation signatures specified and so on
#' @return a value
updatePMSParam <- function(p, y) {
  
    sampleNum <- y[[1]][[1]]
    fdim <- y[[1]][[2]]
    patternList <- y[[1]][[3]]
    sparseCount <- y[[1]][[4]]
    K <- y[[2]]
    isBG <- y[[3]]
    BG0 <- y[[4]]
    
    patternNum <- ncol(patternList)
    samplePatternNum <- ncol(sparseCount)
  
    
    if (isBG) {
      varK <- K - 1
    } else {
      varK <- K
    }
    
    lenF <- varK * (sum(fdim) - length(fdim))
    lenQ <- (K - 1) * sampleNum
    F <- convertFromTurbo_F(p[seq_len(lenF)], fdim, K, isBG)
    Q <- convertFromTurbo_Q(p[(lenF + 1):(lenF + lenQ)], K, sampleNum)
    
    dim(Q) <- c(sampleNum, K)
    Q <- t(Q)
    ####################
    # E-step
    Theta <- updateTheta_NormalizedC(as.vector(patternList), 
                                     as.vector(sparseCount), as.vector(F), 
                                     as.vector(Q), fdim, K, sampleNum, 
                                     patternNum, samplePatternNum, isBG, BG0)
  
    dim(Theta) <- c(K, samplePatternNum)
    
    ####################
    # M-step 
    F_Q <- updateMstepFQC(as.vector(patternList), as.vector(sparseCount), 
                          as.vector(Theta), fdim, K, sampleNum, 
                          patternNum, samplePatternNum, isBG)
    #########################################
    F <- F_Q[seq_len((varK * length(fdim) * max(fdim)))]
    Q <- F_Q[(varK * length(fdim) * max(fdim) + 1):
               (varK * length(fdim) * max(fdim) + K * sampleNum)]
    dim(F) <- c(varK, length(fdim), max(fdim))
    dim(Q) <- c(K, sampleNum)
    Q <- t(Q)
    
    return(c(convertToTurbo_F(as.vector(F), fdim, K, isBG),  
             convertToTurbo_Q(as.vector(Q), K, sampleNum)))
    
}


#' A function for calculating the log-likelihood from the data and parameters
#' 
#' @param p this variable includes the parameters for mutation signatures and 
#'          membership parameters
#' @param y this variable includes the information on the mutation features, 
#'          the number of mutation signatures specified and so on
#' @return a value
calcPMSLikelihood <- function(p, y) {
  
    sampleNum <- y[[1]][[1]]
    fdim <- y[[1]][[2]]
    patternList <- y[[1]][[3]]
    sparseCount <- y[[1]][[4]]
    K <- y[[2]]
    isBG <- y[[3]]
    BG0 <- y[[4]]
    
    patternNum <- ncol(patternList)
    samplePatternNum <- ncol(sparseCount)
    
    
    if (isBG) {
      varK <- K - 1
    } else {
      varK <- K
    }
    
    lenF <- varK * (sum(fdim) - length(fdim))
    lenQ <- (K - 1) * sampleNum
    F <- convertFromTurbo_F(p[seq_len(lenF)], fdim, K, isBG)
    Q <- convertFromTurbo_Q(p[(lenF + 1):(lenF + lenQ)], K, sampleNum)
    
    dim(Q) <- c(sampleNum, K)
    Q <- t(Q)
    ####################
    
    return(getLogLikelihoodC(as.vector(patternList), as.vector(sparseCount), 
                             as.vector(F), as.vector(Q), fdim, K, 
                             sampleNum, patternNum, samplePatternNum, 
                             isBG, BG0))

}


#' A functional for generating the function checking the parameter (p) is 
#' within the restricted conditions or not
#' 
#' @param y this variable includes the information on the mutation features, 
#' the number of mutation signatures specified and so on
#' @return a functional
PMSboundary <- function(y) {
  
    sampleNum <- y[[1]][[1]]
    fdim <- y[[1]][[2]]
    patternList <- y[[1]][[3]]
    sparseCount <- y[[1]][[4]]
    K <- y[[2]]
    isBG <- y[[3]]
    F0 <- y[[4]]
    
    patternNum <- ncol(patternList)
    samplePatternNum <- ncol(sparseCount)
    
    
    if (isBG) {
      varK <- K - 1
    } else {
      varK <- K
    }
    
    lenF <- varK * (sum(fdim) - length(fdim))
    lenQ <- (K - 1) * sampleNum
  
    
    function(p) {
      return(all(boundaryTurbo_F(p[seq_len(lenF)], fdim, varK), 
                 boundaryTurbo_Q(p[(lenF + 1):(lenF + lenQ)], K, sampleNum)))
    }
    
}

Any scripts or data that you put into this service are public.

HiLDA documentation built on Nov. 8, 2020, 11:09 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

HiLDA
Conducting statistical inference on comparing the mutational exposures of mutational signatures by using hierarchical latent Dirichlet allocation

R/pm_getSignature.R
In HiLDA: Conducting statistical inference on comparing the mutational exposures of mutational signatures by using hierarchical latent Dirichlet allocation

Defines functions PMSboundary calcPMSLikelihood updatePMSParam mySquareEM pmgetSignature

Documented in calcPMSLikelihood mySquareEM pmgetSignature PMSboundary updatePMSParam

Try the HiLDA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

HiLDA Conducting statistical inference on comparing the mutational exposures of mutational signatures by using hierarchical latent Dirichlet allocation

R/pm_getSignature.R In HiLDA: Conducting statistical inference on comparing the mutational exposures of mutational signatures by using hierarchical latent Dirichlet allocation

Defines functions PMSboundary calcPMSLikelihood updatePMSParam mySquareEM pmgetSignature

Documented in calcPMSLikelihood mySquareEM pmgetSignature PMSboundary updatePMSParam

Try the HiLDA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

HiLDA
Conducting statistical inference on comparing the mutational exposures of mutational signatures by using hierarchical latent Dirichlet allocation

R/pm_getSignature.R
In HiLDA: Conducting statistical inference on comparing the mutational exposures of mutational signatures by using hierarchical latent Dirichlet allocation