R/F_RCM_NB.R

#' Fit the RC(M) model with the negative binomial distribution.
#'
#' @details Includes fitting of the independence model, filtering out the
#' effect of confounders and fitting the RC(M) components in a constrained
#'  or an unconstrained way for any dimension k. Not intended to be called
#'  directly but only through the RCM() function
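#' In the unconstrained case, the negative binomial mean for sample i and
#' taxon j is built up as log(mu_ij) = log(libSize_i) + log(abund_j) +
#' sum_k psi_k r_ik c_jk, with r and c the row and column scores and psi
#' the importance parameters (a sketch of the model as implemented below).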
#'
#' @param X an nxp data matrix
#' @param k a scalar, the number of dimensions in the RC(M) model
#' @param rowWeights a character string, either 'uniform' or 'marginal'
#'  row weights.
#' @param colWeights a character string, either 'uniform' or 'marginal'
#'  column weights.
#' @param tol a scalar, the relative convergence tolerance for the row scores
#'  and column scores parameters.
#' @param maxItOut an integer, the maximum number
#'  of iterations in the outer loop.
#' @param Psitol a scalar, the relative convergence tolerance
#'  for the psi parameters.
#' @param verbose a boolean, should information on iterations be printed?
#' @param global global strategy for solving non-linear systems, see ?nleqslv
#' @param nleqslv.control a list with control options, see nleqslv
#' @param jacMethod Method for solving non-linear equations, see ?nleqslv.
#' Defaults to Broyden. Unlike the Newton method,
#'  the Jacobian is not recalculated at every iteration,
#'  which speeds up the algorithm
#' @param dispFreq an integer, the number of iterations the algorithm waits
#'  before re-estimating the dispersions.
#' @param convNorm a scalar, the norm to use to determine convergence
#' @param prior.df an integer, see estDisp()
#' @param marginEst a character string, either 'MLE' or 'marginSums',
#' indicating how the independence model should be estimated
#' @param confModelMat an nxg matrix with confounders,
#'  with no reference levels and with intercept
#' @param confTrimMat an nxh matrix with confounders for filtering,
#' with all levels and without intercept
#' @param covModelMat an nxd matrix with covariates.
#' If set to null an unconstrained analysis is carried out,
#' otherwise a constrained one.
#' Factors must have been converted to dummy variables already
#' @param centMat a fxd matrix containing the contrasts to center
#'  the categorical variables. f equals the number of continuous variables +
#'  the total number of levels of the categorical variables.
#' @param prevCutOff a scalar, the minimum prevalence needed to retain a taxon
#'  in the confounder filtering
#' @param minFraction a scalar, a taxon's total abundance must be at least
#'  minFraction*n for it to be retained in the confounder filtering
#' @param responseFun a character string indicating the shape
#'  of the response function
#' @param record A boolean, should intermediate parameter estimates be stored?
#' @param control.outer a list of control options
#'  for the outer loop constrOptim.nl function
#' @param control.optim a list of control options for the optim() function
#' @param envGradEst a character string, indicating how the
#' environmental gradient should be fitted: 'LR' using the likelihood-ratio
#'  criterion, or 'ML' for a full maximum likelihood solution
#' @param dfSpline a scalar, the number of degrees of freedom for the splines
#'  of the non-parametric response function, see VGAM::s()
#' @param vgamMaxit an integer,
#'  the maximum number of iterations in the vgam() function
#' @param degree an integer,
#'  the degree of the polynomial fit if the spline fit fails
#' @param rowExp,colExp exponents for the row and column weights of the singular
#'  value decomposition used to calculate starting values. These can be tuned
#'  in case of numerical difficulties.
#' @param allowMissingness See RCM()
#'
#' @seealso \code{\link{RCM}}
#'
#' @return A list with elements
#' \item{converged}{a vector of booleans of length k indicating if the algorithm
#'  converged for every dimension}
#' \item{rMat}{(if not constrained) an nxk matrix with estimated row scores}
#' \item{cMat}{a kxp matrix with estimated column scores}
#' \item{psis}{a vector of length k
#'  with estimates for the importance parameters psi}
#' \item{thetas}{a vector of length p with estimates for the overdispersion}
#' \item{rowRec}{(if not constrained) an n x k x maxItOut array with a record
#'  of all rMat estimates through the iterations}
#' \item{colRec}{a k x p x maxItOut array with a record of all cMat
#'  estimates through the iterations}
#' \item{psiRec}{a k x maxItOut array with a record of all psi estimates
#'  through the iterations}
#' \item{thetaRec}{ a p x maxItOut matrix with estimates for
#'  the overdispersion along the way}
#' \item{iter}{ number of iterations}
#' \item{Xorig}{ (if confounders provided) the original fitting matrix}
#' \item{X}{ the trimmed matrix if confounders provided,
#'  otherwise the original one}
#' \item{fit}{ type of fit, either 'RCM_NB' or 'RCM_NB_constr'}
#' \item{lambdaRow}{(if not constrained)
#'  vector of Lagrange multipliers for the rows}
#' \item{lambdaCol}{ vector of Lagrange multipliers for the columns}
#' \item{rowWeights}{(if not constrained) the row weights used}
#' \item{colWeights}{ the column weights used}
#' \item{alpha}{(if constrained) the kxd matrix of environmental gradients}
#' \item{alphaRec}{(if constrained) the k x d x maxItOut array of alpha
#'  estimates along the iterations}
#' \item{covariates}{(if constrained) the matrix of covariates}
#' \item{libSizes}{ a vector of length n with estimated library sizes}
#' \item{abunds}{ a vector of length p with estimated mean relative abundances}
#' \item{confounders}{(if provided) the confounder matrix}
#' \item{confParams}{ the parameters used to filter out the confounders}
#' \item{nonParamRespFun}{A list of the non-parametric response functions}
#' \item{degree}{The degree of the alternative parametric fit}
#' \item{NApresent}{A boolean, were NA values present?}
#' @export
#' @note Plotting is not supported for quadratic response functions
#' @examples
#' data(Zeller)
#' require(phyloseq)
#' tmpPhy = prune_taxa(taxa_names(Zeller)[seq_len(100)],
#' prune_samples(sample_names(Zeller)[seq_len(50)], Zeller))
#' mat = as(otu_table(tmpPhy), "matrix")
#' mat = mat[rowSums(mat)>0, colSums(mat)>0]
#' zellerRCM = RCM_NB(mat, k = 2)
#' # Needs to be called directly on a matrix
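#' # A small additional sketch: also record the intermediate estimates
#' # and check convergence per dimension afterwards
#' zellerRCMrec = RCM_NB(mat, k = 2, record = TRUE)
#' zellerRCMrec$converged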
RCM_NB = function(X, k, rowWeights = "uniform", colWeights = "marginal",
    tol = 0.001, maxItOut = 1000L, Psitol = 0.001,
    verbose = FALSE, global = "dbldog", nleqslv.control = list(maxit = 500L,
        cndtol = 1e-16), jacMethod = "Broyden", dispFreq = 10L,
    convNorm = 2, prior.df = 10, marginEst = "MLE",
    confModelMat = NULL, confTrimMat = NULL, prevCutOff,
    minFraction = 0.1, covModelMat = NULL, centMat = NULL,
    responseFun = c("linear", "quadratic", "dynamic",
        "nonparametric"), record = FALSE, control.outer = list(trace = FALSE),
    control.optim = list(), envGradEst = "LR", dfSpline = 3,
    vgamMaxit = 100L, degree = switch(responseFun[1],
        nonparametric = 3, NULL), rowExp = if(is.null(covModelMat)) 1 else 0.5,
    colExp = rowExp, allowMissingness = FALSE){

    Xorig = NULL  #An original matrix, not returned if no trimming occurs
    responseFun = responseFun[1]
    if (!responseFun %in% c("linear", "quadratic",
        "dynamic", "nonparametric")) {
        stop("Unknown response function provided! See ?RCM_NB for details.")
    }

    if (!is.null(confModelMat)) {
        # First and foremost: filter on confounders
        Xorig = X
        X = trimOnConfounders(X, confounders = confTrimMat,
            prevCutOff = prevCutOff, n = nrow(Xorig),
            minFraction = minFraction)
    }
    naId = if(allowMissingness) is.na(X) else NULL

    n = NROW(X)
    p = NCOL(X)

    thetas = matrix(0, p, k + 1 + (!is.null(confModelMat)),
        dimnames = list(colnames(X), c("Independence",
            if (!is.null(confModelMat)) "Filtered" else NULL,
            paste0("Dim", seq_len(k)))))
    # Track the overdispersions, including the one
    # associated with the independence model

    # Initialize some parameters
    abunds = colSums(X, na.rm = TRUE)/sum(X, na.rm = TRUE)
    libSizes = rowSums(X, na.rm = TRUE)

    # Get the trended-dispersion estimates. These are very insensitive
    # to the offset, so they only need to be calculated once per dimension
    trended.dispersion <- edgeR::estimateGLMTrendedDisp(
        y = t(correctXMissingness(X, outer(libSizes, abunds),
                                    allowMissingness, naId)),
        design = NULL, method = "bin.loess", offset = t(log(outer(libSizes,
            abunds))), weights = NULL)
    if (marginEst == "MLE") {
        logLibSizesMLE = log(libSizes)
        logAbundsMLE = log(abunds)
        initIter = 1
        if (verbose)
            cat("\nEstimating the independence model \n\n")
        while ((initIter == 1) || ((initIter <= maxItOut) &&
            (!convergenceInit))) {
            logLibsOld = logLibSizesMLE
            logAbsOld = logAbundsMLE

            thetas[, "Independence"] = estDisp(X = X,
                rMat = as.matrix(rep(0, n)), cMat = t(as.matrix(rep(0,
                p))), muMarg = exp(outer(logLibSizesMLE,
                logAbundsMLE, "+")), psis = 0, prior.df = prior.df,
                trended.dispersion = trended.dispersion,
                allowMissingness = allowMissingness, naId = naId)
            thetasMat = matrix(thetas[, "Independence"],
                n, p, byrow = TRUE)

            logLibSizesMLE = nleqslv(fn = dNBlibSizes,
                x = logLibSizesMLE, theta = thetasMat,
                X = X, reg = logAbundsMLE, global = global,
                control = nleqslv.control, jac = NBjacobianLibSizes,
                method = jacMethod, allowMissingness = allowMissingness,
                naId = naId)$x
              logAbundsMLE = vapply(seq_len(p), FUN.VALUE = double(1), function(j){
                nleqslv(fn = dNBabunds,
                x = logAbundsMLE[j], theta = thetasMat[, j],
                    X = X[, j], reg = logLibSizesMLE, global = global,
                        control = nleqslv.control, jac = NBjacobianAbunds,
                        method = jacMethod, allowMissingness = allowMissingness, naId = naId[, j])$x
            })

            initIter = initIter + 1

            convergenceInit = ((initIter <= maxItOut) &&
                ((sum(abs(1 - logLibSizesMLE/logLibsOld)^convNorm)/n)^
                    (1/convNorm) < tol) &&
                ((sum(abs(1 - logAbundsMLE/logAbsOld)^convNorm)/p)^
                    (1/convNorm) < tol))
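            # i.e. the mean relative change of the log library sizes and the
            # log abundances, in the convNorm-norm, must both drop below tol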
        }
        # Converges very fast, even when dispersions are
        # re-estimated. For the library sizes there is a
        # big difference, for the abundances less so
        muMarg = exp(outer(logLibSizesMLE, logAbundsMLE,
            "+"))
        # The marginals to be used as expectation. These
        # are augmented with the previously estimated
        # dimensions every time
    } else if (marginEst == "marginSums") {
        muMarg = outer(libSizes, abunds)
    } else {
        stop("No valid margin estimation paradigm provided! \n")
    }

    rowWeights = switch(paste(marginEst, rowWeights, sep = "_"),
        marginSums_marginal = libSizes/sum(libSizes),
        MLE_marginal = exp(logLibSizesMLE)/sum(exp(logLibSizesMLE)),
        rep.int(1/n, n))  # Default: uniform weights
    colWeights = switch(paste(marginEst, colWeights, sep = "_"),
        marginSums_marginal = abunds,
        MLE_marginal = exp(logAbundsMLE),
        rep.int(1/p, p))  # Default: uniform weights
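    # Note on the switches above: the key pastes the estimation paradigm and
    # the requested weighting (e.g. "MLE_marginal"); any other combination
    # falls through to the default of uniform weights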

    nLambda = 2 * k + k * (k - 1)/2
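    # One way to read this count: k centering and k normalization
    # constraints, plus k*(k-1)/2 orthogonality constraints between
    # pairs of dimensions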
    if (record) {
        # Pre-allocate arrays to track iterations
        rowRec = array(0, dim = c(n, k, maxItOut))
        colRec = thetaRec = array(0, dim = c(k, p,
            maxItOut))
        psiRec = matrix(0, nrow = k, ncol = maxItOut)
    } else {
        rowRec = colRec = thetaRec = psiRec = NULL
    }
    convergence = rep(FALSE, k)
    iterOut = rep(1, k)
    if (!is.null(confModelMat)) {
        ## Filter out the confounders by adding them to the
        ## intercept, also adapt overdispersions
        filtObj = filterConfounders(muMarg = muMarg,
            confMat = confModelMat, p = p, X = X, thetas = thetas[,
                1], nleqslv.control = nleqslv.control,
            n = n, trended.dispersion = trended.dispersion,
            allowMissingness = allowMissingness, naId = naId)
        muMarg = muMarg * exp(confModelMat %*% filtObj$NB_params)
        thetas[, "Filtered"] = filtObj$thetas
        confParams = filtObj$NB_params
    } else {
        confParams = NULL
    }
    ## 1) Initialization
    svdX = svd(diag(1/libSizes^rowExp) %*%
            (correctXMissingness(X, muMarg, allowMissingness, naId)
             - muMarg) %*% diag(1/colSums(X, na.rm = TRUE)^colExp))
    rMat = svdX$u[, seq_len(k), drop = FALSE]
    cMat = t(svdX$v[, seq_len(k), drop = FALSE])
    psis = svdX$d[seq_len(k)]
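    # Starting values: a correspondence-analysis-like SVD of the deviations
    # X - muMarg, with rows and columns downweighted by libSizes^rowExp and
    # the column sums^colExp respectively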

    lambdaRow = rep.int(0, nLambda)
    lambdaCol = rep.int(0, nLambda)
    # Center
    cMat = t(apply(cMat, 1, function(colS) {
        colS - sum(colS * colWeights)/sum(colWeights)
    }))
    rMat = apply(rMat, 2, function(rowS) {
        rowS - sum(rowS * rowWeights)/sum(rowWeights)
    })

    # Redistribute some weight to fit the constraints
    psis = c(psis * t(apply(cMat, 1, function(colS) {
        sqrt(sum(colWeights * colS^2))
    })) * apply(rMat, 2, function(rowS) {
        sqrt(sum(rowWeights * rowS^2))
    }))
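    # The weighted norms of the raw SVD scores are absorbed into psi, so
    # that the scores can be rescaled to unit weighted norm below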

    # Normalize
    cMat = t(apply(cMat, 1, function(colS) {
        colS/sqrt(sum(colWeights * colS^2))
    }))

    rMat = apply(rMat, 2, function(rowS) {
        rowS/sqrt(sum(rowWeights * rowS^2))
    })
    if (is.null(covModelMat)) {
        # If no covariates provided, perform an
        # unconstrained analysis
        for (KK in seq_len(k)) {

            if (verbose)
                cat("Dimension", KK, "is being esimated \n")

            # Modify offset if needed
            if (KK > 1) {
                muMarg = muMarg * exp(rMat[, (KK -
                1), drop = FALSE] %*% (cMat[(KK -
                1), , drop = FALSE] * psis[(KK -
                1)]))
            }
            idK = seq_k(KK)  #prepare an index

            # Re-estimate the trended dispersions, once per
            # dimension
            trended.dispersion <- edgeR::estimateGLMTrendedDisp(
              y = t(correctXMissingness(X, muMarg, allowMissingness, naId)),
                design = NULL, method = "bin.loess",
                offset = t(log(muMarg)), weights = NULL)

            JacR = matrix(0, nrow = n + KK + 1, ncol = n +
                KK + 1)
            # Prepare sparse Jacobians, and prefill what we can
            JacR[seq_len(n), n + 1] = rowWeights
            if (KK > 1) {
                JacR[seq_len(n), (n + 3):(n + KK +
                1)] = rMat[, seq_len(KK - 1), drop = FALSE] *
                rowWeights
            }
            # Symmetrize
            JacR = JacR + t(JacR)

            JacC = matrix(0, nrow = p + KK + 1, ncol = p +
                KK + 1)
            JacC[seq_len(p), p + 1] = colWeights
            if (KK > 1) {
                JacC[seq_len(p), (p + 3):(p + KK +
                1)] = t(cMat[seq_len(KK - 1), , drop = FALSE]) *
                colWeights
            }
            # Symmetrize
            JacC = JacC + t(JacC)
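            # Only the constraint gradients that stay constant over the
            # iterations are prefilled here; the score-dependent entries are
            # presumably completed inside the NBjacobianRow/NBjacobianCol calls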

            while ((iterOut[KK] == 1) || ((iterOut[KK] <=
                maxItOut) && (!convergence[KK]))) {

                if (verbose && iterOut[KK]%%1 == 0) {
                cat("\n", "Outer Iteration", iterOut[KK],
                    "\n", "\n")
                if (iterOut[KK] != 1) {
                    cat("Old psi-estimate: ", psisOld,
                    "\n")
                    cat("New psi-estimate: ", psis[KK],
                    "\n")
                }
                }
                ## 2)a. Store old parameters
                psisOld = psis[KK]
                rMatOld = rMat[, KK]
                cMatOld = cMat[KK, ]

                # Overdispersions (not re-estimated at every iteration to
                # speed things up; the estimates do not change much anyway)
                if ((iterOut[KK]%%dispFreq) == 0 ||
                (iterOut[KK] == 1)) {
                if (verbose)
                    cat(" Estimating overdispersions \n")
                thetas[, paste0("Dim", KK)] = estDisp(X = X,
                    rMat = rMat[, KK, drop = FALSE],
                    cMat = cMat[KK, , drop = FALSE],
                    muMarg = muMarg, psis = psis[KK],
                    prior.df = prior.df,
                    trended.dispersion = trended.dispersion,
                    allowMissingness = allowMissingness)
                thetasMat = matrix(thetas[, paste0("Dim",
                    KK)], n, p, byrow = TRUE)
                # Make a matrix for numerical reasons, it avoids
                # excessive use of the t() function
                preFabMat = 1 + X/thetasMat
                # Another matrix that can be pre-calculated
                }
                # Psis
                if (verbose)
                    cat("\n Estimating psis \n")
                regPsis = outer(rMat[, KK], cMat[KK,])
                psiTry = try(abs(nleqslv(fn = dNBpsis,
                    x = psis[KK], theta = thetasMat,
                    X = X, reg = regPsis, muMarg = muMarg,
                    global = global, control = nleqslv.control,
                    jac = NBjacobianPsi, method = jacMethod,
                    preFabMat = preFabMat, allowMissingness = allowMissingness, naId = naId)$x))
                if (inherits(psiTry, "try-error")) {
                stop("Fit failed, likely due to numeric reasons. Consider more
                stringent filtering by increasing the prevCutOff parameter.\n")
                } else {
                    psis[KK] = psiTry
                }

                  # Row scores
                if (verbose)
                    cat("\n Estimating row scores \n")
                regRow = cMat[KK, , drop = FALSE] *psis[KK]
                tmpRow = nleqslv(fn = dNBllrow,
                x = c(rMat[,KK], lambdaRow[idK]), thetas = thetasMat,
                X = X, reg = regRow, muMarg = muMarg,
                    k = KK, global = global, control = nleqslv.control,
                    n = n, p = p, jac = NBjacobianRow,
                    method = jacMethod, rowWeights = rowWeights,
                    nLambda = (KK + 1), rMatK = rMat[,
                    seq(1, (KK - 1)), drop = FALSE],
                    preFabMat = preFabMat, Jac = JacR,
                allowMissingness = allowMissingness)

                if (verbose)
                    cat(ifelse(tmpRow$termcd == 1, "Row scores converged \n",
                    "Row scores DID NOT converge \n"))
                rMat[, KK] = tmpRow$x[seq_len(n)]
                lambdaRow[idK] = tmpRow$x[n + seq_along(idK)]

                # Normalize (speeds up the algorithm if the previous
                # step had not converged)
                rMat[, KK] = rMat[, KK] - sum(rMat[,
                    KK] * rowWeights)/sum(rowWeights)
                rMat[, KK] = rMat[, KK]/sqrt(sum(rowWeights *
                    rMat[, KK]^2))

                # Column scores
                if (verbose)
                    cat("\n Estimating column scores \n")
                regCol = rMat[, KK, drop = FALSE] *
                    psis[KK]
                cMat[KK, ]  = vapply(seq_len(p), FUN.VALUE = double(1), function(j){
                    nleqslv(fn = dNBllcol, x = cMat[KK,j], thetas = thetasMat[, j],
                            X = X[, j], reg = regCol, muMarg = muMarg[, j],
                            k = KK, global = global, control = nleqslv.control,
                            n = n, p = p, jac = NBjacobianCol,
                            method = jacMethod, colWeights = colWeights,
                            nLambda = (KK + 1), cMatK = cMat[seq(1,(KK - 1)), , drop = FALSE], preFabMat = preFabMat[, j],
                            Jac = JacC, allowMissingness = allowMissingness, naId = naId[, j])$x
                })

                # Normalize and center here
                cMat[KK, ] = cMat[KK, ] - sum(cMat[KK,
                ] * colWeights)/sum(colWeights)
                cMat[KK, ] = cMat[KK, ]/sqrt(sum(colWeights *
                                                     cMat[KK, ]^2))
                # From the second dimension on: Gram-Schmidt orthogonalize
                # with respect to the previous dimensions
                if(KK>1)
                    cMat[KK, ] = GramSchmidt(cMat[KK, ], cMat[seq_len(KK-1), ,drop = FALSE], weights = colWeights)
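                # A sketch of the operation: for every previous dimension j
                # (already at unit weighted norm), the weighted projection is
                # removed: cNew = c - sum(colWeights * c * cMat[j, ]) * cMat[j, ]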

                if (record) {
                    # Store intermediate estimates
                    rowRec[, KK, iterOut[KK]] = rMat[,
                    KK]
                    colRec[KK, , iterOut[KK]] = cMat[KK,
                    ]
                thetaRec[KK, , iterOut[KK]] = thetas[,
                    paste0("Dim", KK)]
                psiRec[KK, iterOut[KK]] = psis[KK]
                }

                ## Change iterator
                iterOut[KK] = iterOut[KK] + 1

                ## Check convergence (relative changes in psi and the row
                ## and column scores, in the convNorm-norm)
                convergence[KK] = ((iterOut[KK] <= maxItOut) &&
                    (abs(1 - psis[KK]/psisOld) < Psitol) &&
                    ((mean(abs(1 - rMat[, KK]/rMatOld)^convNorm))^
                        (1/convNorm) < tol) &&
                    ((mean(abs(1 - cMat[KK, ]/cMatOld)^convNorm))^
                        (1/convNorm) < tol))
            }  # END while-loop until convergence
        }  # END for-loop over dimensions

        ## 3) Termination
        rownames(rMat) = rownames(X)
        colnames(cMat) = colnames(X)
        rownames(cMat) = colnames(rMat) = paste0("Dim", seq_len(k))

        returnList = list(rMat = rMat, cMat = cMat,
            rowRec = rowRec, colRec = colRec, psiRec = psiRec,
            thetaRec = thetaRec, fit = "RCM_NB", lambdaRow = lambdaRow,
            lambdaCol = lambdaCol)
    } else {
        # If covariates provided, do a constrained analysis
        d = ncol(covModelMat)
        CCA = constrCorresp(X = correctXMissingness(X, muMarg, allowMissingness, naId),
                         Y = covModelMat, rowExp = rowExp, colExp = colExp)
        # Constrained correspondence analysis for starting values
        if (sum(!colnames(covModelMat) %in% CCA$alias) <
            k) {
            k = sum(!colnames(covModelMat) %in% CCA$alias)
            warning(immediate. = TRUE, paste("Can only fit an ordination with",
                k, "dimensions with so few covariates!"))
        }
        alpha = matrix(0, d, k)
        alpha[!colnames(covModelMat) %in% CCA$alias,
            ] = CCA$biplot[, seq_len(k)]
        # Leave the sum constraints for the factors alone
        # for now, may or may not speed up the algorithm
        alpha = t(t(alpha) - colMeans(alpha))
        alpha = t(t(alpha)/sqrt(colSums(alpha^2)))
        psis = CCA$eig[seq_len(k)]
        alphaRec = if (record) {
            array(0, dim = c(d, k, maxItOut))
        } else {
            NULL
        }
        v = switch(responseFun, linear = 2, quadratic = 3,
            dynamic = 3, 1)
        # Number of parameters per taxon
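        # (linear: intercept + slope; quadratic/dynamic: plus a curvature
        # term; nonparametric: a single placeholder)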
        NB_params = array(0.1, dim = c(v, p, k))
        # Initialize the parameters of the response function, taxon-wise.
        # Avoid zeroes, which would yield a trivial fit! Improved
        # starting values may be possible.
        NB_params = if (responseFun != "nonparametric")
            vapply(seq_len(k), FUN.VALUE = matrix(0,
                v, p), function(x) {
                x = NB_params[, , x, drop = FALSE]
                x/sqrt(rowSums(x^2))
            }) else NULL
        NB_params_noLab = if (responseFun != "nonparametric" &&
            envGradEst == "LR")
            matrix(0.1, v, k) else NULL
        # Initialize the parameters of the response function, ignoring taxon labels
        if (responseFun == "nonparametric") {
            nonParamRespFun = lapply(seq_len(k), function(x) {
                list(taxonWise = lapply(integer(p),
                function(d) {
                    list(fit = list(coef = NULL))
                }), overall = NULL)
            })
        } else {
            nonParamRespFun = NULL
        }
        rowMat = NULL

        # Number of lambda parameters for centering
        nLambda1s = NROW(centMat)

        lambdasAlpha = rep(0, (k * (1 + nLambda1s +
            (k - 1)/2)))
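        # Per dimension: 1 normalization constraint + nLambda1s centering
        # constraints + one orthogonality constraint per previous dimension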
        for (KK in seq_len(k)) {
            if (verbose)
                cat("Dimension", KK, "is being esimated \n")

            # Modify offset if needed
            if (KK > 1) {
                muMarg = if (responseFun %in% c("linear",
                "quadratic", "dynamic")) {
                exp(getRowMat(responseFun = responseFun,
                    sampleScore = covModelMat %*% alpha[,
                    KK - 1, drop = FALSE], NB_params = NB_params[,
                    , KK - 1]) * psis[KK - 1]) *
                    muMarg
                } else {
                exp(nonParamRespFun[[KK - 1]]$rowMat) *
                    muMarg
                }
            }

            idK = seq_k(KK)
            ## 2) Propagation
            while ((iterOut[KK] == 1) || ((iterOut[KK] <=
                maxItOut) && (!convergence[KK]))) {

                if (verbose && iterOut[KK]%%1 == 0) {
                cat("\n", "Outer Iteration", iterOut[KK],
                    "\n", "\n")
                if (iterOut[KK] != 1 && responseFun !=
                    "nonparametric") {
                    cat("Old psi-estimate: ", psisOld,
                    "\n")
                    cat("New psi-estimate: ", psis[KK],
                    "\n")
                }
                }
                ## 2)a. Store old parameters to check for
                ## convergence
                psisOld = psis[KK]
                alphaOld = alpha[, KK]
                NBparamsOld = NB_params[, , KK]

                sampleScore = covModelMat %*% alpha[,KK]
                envRange = range(sampleScore)
                if (responseFun %in% c("linear", "quadratic","dynamic")) {
                    design = buildDesign(sampleScore,responseFun)
                    rowMat = design %*% NB_params[, ,KK]
                } else {
                    rowMat = nonParamRespFun[[KK]]$rowMat
                }
                # Overdispersions (not re-estimated at every iteration to
                # speed things up; they don't change much anyway)
                if ((iterOut[KK]%%dispFreq) == 0 ||
                    iterOut[KK] == 1) {
                    if (verbose)
                    cat(" Estimating overdispersions \n")
                    thetas[, paste0("Dim", KK)] = estDisp(X = X,
                    muMarg = muMarg, psis = psis[KK],
                    prior.df = prior.df,
                    trended.dispersion = trended.dispersion,
                    rowMat = rowMat, allowMissingness = allowMissingness, naId = naId)
                    thetasMat = matrix(thetas[, paste0("Dim",
                    KK)], n, p, byrow = TRUE)
                    preFabMat = 1 + X/thetasMat
                }

                if (responseFun %in% c("linear", "quadratic","dynamic")) {
                # Psis
                if (verbose)
                    cat("\n Estimating psis (k = ",KK, ") \n", sep = "")
                psis[KK] = abs(nleqslv(fn = dNBpsis,
                    x = psis[KK], theta = thetasMat,
                    X = X, reg = rowMat, muMarg = muMarg,
                    global = global, control = nleqslv.control,
                    jac = NBjacobianPsi, method = jacMethod,
                    preFabMat = preFabMat, allowMissingness = allowMissingness, naId = naId)$x)

                if (verbose)
                    cat("\n Estimating response function \n")
                NB_params[, , KK] = estNBparams(design = design,
                    thetas = thetas[, paste0("Dim",
                    KK)], muMarg = muMarg, psi = psis[KK],
                    X = X, nleqslv.control = nleqslv.control,
                    ncols = p, initParam = NB_params[,
                    , KK], v = v, dynamic = responseFun ==
                    "dynamic", envRange = envRange,
                    allowMissingness = allowMissingness)
                NB_params[, , KK] = NB_params[, , KK]/
                sqrt(rowSums(NB_params[, ,KK]^2))

                if (envGradEst == "LR") {
                    NB_params_noLab[, KK] = estNBparamsNoLab(design = design,
                    thetasMat = thetasMat, muMarg = muMarg,
                    psi = psis[KK], X = X, nleqslv.control = nleqslv.control,
                    initParam = NB_params_noLab[,KK], v = v,
                    dynamic = responseFun ==
                        "dynamic", envRange = envRange,
                    preFabMat = preFabMat, n = n,
                    allowMissingness = allowMissingness)
                }
                if (verbose)
                    cat("\n Estimating environmental gradient \n")
                    AlphaTmp = nleqslv(x = c(alpha[,
                    KK], lambdasAlpha[seq_k(KK, nLambda1s)]),
                    fn = dLR_nb, jac = LR_nb_Jac, X = X,
                    CC = covModelMat, responseFun = responseFun,
                    cMat = cMat, psi = psis[KK], NB_params = NB_params[,, KK],
                    NB_params_noLab = NB_params_noLab[,
                    KK], alphaK = alpha[, seq_len(KK -1), drop = FALSE],
                    k = KK, d = d,
                    centMat = centMat, nLambda = nLambda1s + KK,
                    nLambda1s = nLambda1s, thetaMat = thetasMat,
                    muMarg = muMarg, control = nleqslv.control,
                    n = n, v = v, ncols = p, preFabMat = preFabMat,
                    envGradEst = envGradEst, allowMissingness = allowMissingness, naId = naId)$x
                    alpha[, KK] = AlphaTmp[seq_len(d)]
                    lambdasAlpha[seq_k(KK, nLambda1s)] = AlphaTmp[-seq_len(d)]
                } else {
                if (verbose)
                    cat("\n Estimating response functions \n")
                    nonParamRespFun[[KK]] = estNPresp(sampleScore = sampleScore,
                    muMarg = muMarg, X = X, ncols = p,
                    thetas = thetas[, paste0("Dim",KK)],
                    n = n, coefInit = nonParamRespFun[[KK]]$taxonCoef,
                    coefInitOverall = nonParamRespFun[[KK]]$overall$coef,
                    vgamMaxit = vgamMaxit, dfSpline = dfSpline,
                    verbose = verbose, degree = degree,
                    allowMissingness = allowMissingness, naId = naId)
                    if (verbose)
                    cat("\n Estimating environmental gradient \n")
                    AlphaTmp = constrOptim.nl(par = alpha[,KK],
                    fn = LR_nb, gr = NULL, heq = heq_nb,
                    heq.jac = heq_nb_jac, alphaK = alpha[,
                    seq_len(KK - 1), drop = FALSE],
                    X = X, CC = covModelMat, responseFun = responseFun,
                    muMarg = muMarg, d = d, ncols = p,
                    control.outer = control.outer,
                    control.optim = control.optim,
                    k = KK, centMat = centMat, n = n,
                    nonParamRespFun = nonParamRespFun[[KK]],
                    thetaMat = thetasMat, envGradEst = envGradEst,
                    allowMissingness = allowMissingness)
                    alpha[, KK] = AlphaTmp$par
                    lambdasAlpha[seq_k(KK, nLambda1s)] = AlphaTmp$lambda
                    }

                # Store intermediate estimates
                if (record) {
                    alphaRec[, KK, iterOut[KK]] = alpha[,KK]
                    thetaRec[KK, , iterOut[KK]] = thetas[,
                    paste0("Dim", KK)]
                    psiRec[KK, iterOut[KK]] = psis[KK]
                }
                ## Change iterator
                iterOut[KK] = iterOut[KK] + 1

                ## Check convergence (relative changes in psi, alpha and
                ## the response-function parameters, in the convNorm-norm)
                convergence[KK] = ((iterOut[KK] <= maxItOut) &&
                    (abs(1 - psis[KK]/psisOld) < Psitol) &&
                    ((mean(abs(1 - alpha[, KK]/alphaOld)^convNorm))^
                        (1/convNorm) < tol) &&
                    (if (responseFun == "nonparametric") TRUE else
                        (mean(abs(1 - (NB_params[, , KK]/NBparamsOld)[
                            NBparamsOld * NB_params[, , KK] != 0])^convNorm)^
                            (1/convNorm) < tol)))
            }  # END while-loop until convergence

        }  # END for-loop over dimensions
        ## 3) Termination

        if (responseFun == "nonparametric") {
            # Post hoc calculate integrals and psis
            nonParamRespFun = lapply(seq_len(k), function(KK) {
                samScore = covModelMat %*% alpha[,KK]
                nonPar = nonParamRespFun[[KK]]
                nonPar$intList = vapply(FUN.VALUE = numeric(1),
                    colnames(X), function(tax) {
                    getInt(coef = nonParamRespFun[[KK]]$taxonCoef[[tax]],
                    spline = nonParamRespFun[[KK]]$splineList[[tax]],
                    sampleScore = samScore)
                })
                return(nonPar)
            })
            psis = vapply(nonParamRespFun, function(x) {
                sqrt(mean(x$rowMat^2))
            }, numeric(1))
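            # psi is recovered post hoc as the root mean square of the
            # fitted linear predictors of each dimension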
            names(nonParamRespFun) = names(psis) = paste0("Dim",
                seq_len(k))
        }

        rownames(alpha) = colnames(covModelMat)
        colnames(alpha) = paste0("Dim", seq_len(k))

        returnList = list(fit = "RCM_NB_constr", lambdaCol = lambdaCol,
            alpha = alpha, alphaRec = alphaRec, covariates = covModelMat,
            NB_params = NB_params, NB_params_noLab = NB_params_noLab,
            responseFun = responseFun, nonParamRespFun = nonParamRespFun,
            envGradEst = envGradEst, lambdasAlpha = lambdasAlpha,
            degree = degree)
    }
    if (!all(convergence)) {
        warning(paste0("Algorithm did not converge for dimensions ",
            paste(which(!convergence), collapse = ","),
            "! Check for errors or consider changing tolerances
        or number of iterations"))
    }
    return(c(returnList, list(converged = convergence,
        psis = psis, thetas = thetas, psiRec = psiRec,
        thetaRec = thetaRec, iter = iterOut - 1, X = X,
        Xorig = Xorig, rowWeights = rowWeights, colWeights = colWeights,
        libSizes = switch(marginEst, MLE = exp(logLibSizesMLE),
            marginSums = libSizes), abunds = switch(marginEst,
            MLE = exp(logAbundsMLE), marginSums = abunds),
        confounders = confModelMat, confParams = confParams,
        NApresent = allowMissingness)))
}