R/utils.R

Defines functions summary.rlm_dce rlm_dce estimate_latent_count trueEffects make.log.link estimateTheta resample_edge_weights create_random_DAG get_prediction_counts Gsolve g2dag pcor as_adjmat permutation_test

Documented in as_adjmat create_random_DAG estimate_latent_count g2dag get_prediction_counts pcor permutation_test resample_edge_weights rlm_dce summary.rlm_dce trueEffects

#' Permutation test for (partial) correlation on non-Gaussian data
#'
#' Computes the significance of (partial) correlation based
#' on permutations of the observations
#' @param x wild type data set
#' @param y mutant data set
#' @param iter number of iterations (permutations)
#' @param fun function to compute the statistic, e.g., cor or pcor
#' @param mode either 1 for a function that takes a single data set
#' and produces an output of class matrix, and 2, if the function takes
#' two data sets
#' @param ... additional arguments for function 'fun'
#' @return matrix of p-values
#' @export
#' @examples
#' x <- matrix(rnorm(100),10,10)
#' y <- matrix(rnorm(100),10,10)
#' permutation_test(x,y,iter=10)
permutation_test <- function(x, y, iter = 1000, fun = pcor, mode = 1, ...) {
    if (mode == 1) {
        z <- fun(y, ...) - fun(x, ...)
    } else {
        z <- fun(x, y, ...)
    }
    p <- z * 0
    for (i in seq_len(iter)) {
        xy <- rbind(x, y)
        xyp <- xy[sample(seq_len(nrow(xy)), nrow(xy)), ]
        xp <- xyp[seq_len(nrow(x)), ]
        yp <- xyp[-seq_len(nrow(x)), ]
        if (mode == 1) {
            zp <- fun(yp, ...) - fun(xp, ...)
        } else {
            zp <- fun(xp, yp, ...)
        }
        idx <- which(abs(zp) >= abs(z))
        p[idx] <- p[idx] + 1
    }
    p <- p / iter
    return(p)
}

#' graph to adjacency
#'
#' From graphNEL with 0 edge weights to proper adjacency matrix
#' @param g graphNEL object
#' @export
#' @return graph as adjacency matrix
#' @importFrom naturalsort naturalorder
#' @examples
#' dag <- create_random_DAG(30, 0.2)
#' adj <- as_adjmat(dag)
as_adjmat <- function(g) {
    adj <- as(g, "matrix")
    for (p in names(g@edgeData@data)) {
        a <- gsub("\\|.*", "", p)
        b <- gsub(".*\\|", "", p)
        adj[a, b] <- 1
    }
    adj <- adj[naturalorder(rownames(adj)), naturalorder(colnames(adj))]
    return(adj)
}

#' Partial correlation
#'
#' Robust partial correlation of column variables of a
#' numeric matrix
#' @param x matrix
#' @param g related graph as adjacency matrix (optional)
#' @param adjustment_type character string for the method
#' to define the adjustment set Z for the regression
#' @param ... additional arguments for function 'cor'
#' @importFrom ppcor pcor
#' @export
#' @return matrix of partial correlations
#' @examples
#' x <- matrix(rnorm(100),10,10)
#' pcor(x)
pcor <- function(x, g = NULL, adjustment_type = "parents", ...) {
    if (is.null(g)) {
        rho <- try(ppcor::pcor(x, ...), silent = TRUE)
        if (length(grep("Error", rho)) > 0) {
            warning(paste0("Moore-Penrose generalized matrix invers in ",
                           "function ppcor::pcor crashed. Using ",
                           "MAAS::ginv instead."))
            omega <- cor(x, ...)
            p <- Gsolve(omega)
            pdiag <- diag(p) %*% t(diag(p))
            rho <- -p / (pdiag^0.5)
            diag(rho) <- 1
            rho[which(is.na(rho) | is.infinite(rho))] <- 0
        } else {
            rho <- rho$estimate
        }
        rownames(rho) <- colnames(rho) <- colnames(x)
    } else {
        edges <- which(g == 1, arr.ind = TRUE)
        omega <- cor(x, ...)
        rho <- omega * 0
        for (i in seq_len(nrow(edges))) {
            x <- edges[i, 1]
            y <- edges[i, 2]
            z <- get_adjustment_set(
                g, x, y,
                adjustment_type,
                effect_type = "total"
            )
            z <- which(colnames(g) %in% z)
            p <- Gsolve(omega[c(x, y, z), c(x, y, z)])
            pdiag <- diag(p) %*% t(diag(p))
            rhoz <- -p / (pdiag^0.5)
            diag(rhoz) <- 1
            rhoz[which(is.na(rhoz) | is.infinite(rhoz))] <- 0
            rho[x, y] <- rhoz[1, 2]
        }
    }
    return(rho)
}


#' Graph to DAG
#'
#' Converts a general graph to a dag with minimum distance to the
#' original graph. The general idea is to transitively close the graph
#' to detect cycles and remove them based on the rule "the more outgoing
#' edges a node has, the more likely it is that incoming edges from a
#' cycle will be deleted, and vice versa. However, this is too rigorous
#' and deletes too many edges, which do not lead to a cycle. These
#' edges are added back in the final step.
#' @param g graph as adjacency matrix
#' @param tc if TRUE computes the transitive closure
#' @author Ken Adams
#' @return dag as adjacency matrix
#' @export
#' @importFrom mnem transitive.closure
#' @import logger
#' @examples
#' g <- matrix(c(1,0,1,0, 1,1,0,0, 0,1,1,0, 1,1,0,1), 4, 4)
#' rownames(g) <- colnames(g) <- LETTERS[seq_len(4)]
#' dag <- g2dag(g)
g2dag <- function(g, tc = FALSE) {
    ord <- order(apply(g, 1, sum) - apply(g, 2, sum), decreasing = 1)
    g <- g[ord, ord]
    cyc <- intersect(which(g + t(g) > 1), which(lower.tri(g) == TRUE))
    g[cyc] <- 0
    diag(g) <- 1
    g2 <- g
    g0 <- g * 0
    epiNEM::HeatmapOP(g, Colv = 0, Rowv = 0)
    for (i in seq_len(nrow(g))) {
        g2 <- g2 %*% g
        g2[which(g2 > 0)] <- 1
        ord <- order(apply(g2, 1, sum) - apply(g2, 2, sum), decreasing = 1)
        g2 <- g2[ord, ord]
        g <- g[ord, ord]
        cyc <- intersect(which(g2 + t(g2) > 1), which(lower.tri(g2) == TRUE))
        g2[cyc] <- 0
        if (all(g0 == g2)) {
            break
        }
        g0 <- g2
        epiNEM::HeatmapOP(g2, Colv = 0, Rowv = 0)
    }
    if (!tc) {
        logger::log_trace(g)
        g3 <- g2 * 0
        g3[which(g2 == 1 & g == 1)] <- 1
        for (e in which(g == 1 & g3 == 0)) {
            for (a in seq_len(ncol(g))) {
                b <- e - a * nrow(g)
                if (b < 0) {
                    b <- e - (a - 1) * nrow(g)
                    break
                }
            }
            logger::log_trace(e)
            logger::log_trace(b)
            logger::log_trace(a)
            if (sum(g[b, ]) >= sum(g[a, ])) {
                g3[b, a] <- 1
            }
            g4 <- transitive.closure(g3)
            ord <- order(apply(g4, 1, sum) - apply(g4, 2, sum), decreasing = 1)
            g4 <- g4[ord, ord]
            if (any(g2 + t(g2) > 1)) {
                g3[a, b] <- 0
            }
        }
        g2 <- g3
    }
    return(g2)
}


#' @importFrom MASS ginv
#' @noRd
Gsolve <- function(a, b=NULL, ...) {
    if (!is.null(b)) {
        x <- t(solve(diag(1, nrow(a)), MASS::ginv(a, ...) %*% b))
    } else {
        x <- MASS::ginv(a, ...)
    }
    return(x)
}


#' Compute true positive/... counts
#'
#' Useful for performance evaluations
#' @param truth Ground truth
#' @param inferred Computed results
#' @param cutoff Threshold for classification
#' @author Hans Wurst
#' @return data.frame
#' @export
#' @examples
#' get_prediction_counts(c(1,0), c(1,1))
get_prediction_counts <- function(truth, inferred, cutoff = 0.5) {
    tp <- sum(
        abs(truth) > cutoff &
        abs(inferred) > cutoff &
        sign(truth) == sign(inferred) & inferred != 0,
        na.rm = TRUE
    )
    fn <- sum(
        abs(truth) > cutoff &
        abs(inferred) <= cutoff &
        inferred != 0,
        na.rm = TRUE
    )
    fp <- sum(
        abs(truth) <= cutoff &
        abs(inferred) > cutoff |
        (
            abs(truth) > cutoff &
            abs(inferred) > cutoff &
            sign(truth) != sign(inferred)
        ) &
        inferred != 0,
        na.rm = TRUE
    )
    tn <- sum(
        abs(truth) <= cutoff &
        abs(inferred) <= cutoff &
        inferred != 0,
        na.rm = TRUE
    )

    return(data.frame(
        true.positive = tp,
        false.positive = fp,
        true.negative = tn,
        false.negative = fn
    ))
}


#' Create random DAG (topologically ordered)
#'
#' Creates a DAG according to given parameters.
#' @param node_num Number of nodes
#' @param prob Probability of creating an edge
#' @param eff_min Lower bound for edge weights
#' @param eff_max Upper bound for edge weights
#' @param node_labels Node labels
#' @param max_par Maximal number of parents
#' @author Martin Pirkl
#' @return graph
#' @export
#' @importFrom methods new
#' @examples
#' dag <- create_random_DAG(30, 0.2)
create_random_DAG <- function(
    node_num, prob,
    eff_min = -1, eff_max = 1,
    node_labels = paste0("n", as.character(seq_len(node_num))),
    max_par = 3
) {
    stopifnot(
        node_num >= 2,
        is.numeric(prob),
        length(prob) == 1,
        0 <= prob,
        prob <= 1,
        is.numeric(eff_min),
        is.numeric(eff_max),
        eff_min <= eff_max
    )

    # create (directed) adjacency matrix
    mat <- matrix(rbinom(node_num * node_num, 1, prob), node_num, node_num)
    mat[lower.tri(mat)] <- 0
    diag(mat) <- 0

    # make sure to abide to max parents
    mat <- apply(mat, 2, function(x) {
        if (sum(x) > max_par) {
            y <- sample(which(x == 1), max_par)
            x[-y] <- 0
        }
        return(x)
    })

    # assign effects
    mat[mat != 0] <- runif(sum(mat != 0), min = eff_min, max = eff_max)

    # set node labels
    rownames(mat) <- colnames(mat) <- node_labels

    # topologically sort graph
    nodes_sorted <- igraph::topo_sort(
        igraph::graph_from_adjacency_matrix(mat, weighted = TRUE)
    )
    mat <- mat[nodes_sorted, nodes_sorted]

    # return graph
    igraph::as_graphnel(
        igraph::graph_from_adjacency_matrix(mat, weighted = TRUE)
    )
}


#' Resample network edge weights
#'
#' Takes a graph and modifies edge weights.
#' @param g original graph
#' @param tp fraction of edge weights which will be modified
#' @param mineff minimal differential effect size
#' @param maxeff maximum effect effect size or standard deviation,
#' if method is "gauss"
#' @param method method for drawing the differential for the causal effects.
#' Can be "unif", "exp" or "gauss".
#' @author Martin Pirkl
#' @return graph with new edge weights
#' @export
#' @examples
#' graph.wt <- as(matrix(c(0,0,0,1,0,0,0,1,0), 3), "graphNEL")
#' graph.mt <- resample_edge_weights(graph.wt)
resample_edge_weights <- function(g, tp = 0.5,
                                  mineff = 1, maxeff = 2,
                                  method = "unif") {
    gold <- g
    g <- as(g, "matrix")
    changes <- floor((1 - tp) * sum(g != 0))
    gbin <- g
    gbin[which(gbin != 0)] <- 1
    gtr <- gtr2 <- mnem::transitive.reduction(gbin)
    diag(gtr) <- 1
    noedges <- 1
    start <- TRUE
    keep <- NULL
    while (changes > 0 & sum(gtr2) != 0 | start) {
        start <- FALSE
        if (changes == 1 & sum(gtr2) == 1) {
            keep <- c(keep, which(gtr2 == 1))
        } else {
            keep <- c(keep,
                      sample(which(gtr2 == 1),
                             min(changes, sum(gtr2))))
        }
        changes <- changes - sum(gtr2)
        diag(gtr2) <- 1
        gtr2[noedges] <- 1
        gtr2[keep] <- 1
        gtr2 <- gtr2 %*% gtr
        gtr2[which(gtr2 > 0)] <- 1
        noedges <- which(gbin == 0 & gtr2 == 1)
        gtr2[noedges] <- 0
        gtr2[keep] <- 0
    }
    g2 <- g
    change <- which(g != 0)
    change <- unlist(lapply(change, function(x) {
        eff <- g2[x]
        die <- sample(c(0, 1), 1)
        if (die) {
            if (method == "unif") {
                effnew <- runif(1, eff + mineff, eff + maxeff)
            } else if (method == "gauss") {
                effnew <- eff + mineff + abs(rnorm(1, 0, maxeff))
            } else if (method == "exp") {
                effnew <- eff + mineff + rexp(1, maxeff)
            }
        } else {
            if (method == "unif") {
                effnew <- runif(1, eff - maxeff, eff - mineff)
            } else if (method == "gauss") {
                effnew <- eff - mineff - abs(rnorm(1, 0, maxeff))
            } else if (method == "exp") {
                effnew <- eff - mineff - rexp(1, maxeff)
            }
        }
        return(effnew)
    }))
    g[which(g != 0)] <- change
    g[keep] <- g2[keep]
    edges <- which(gbin == 1, arr.ind = TRUE)
    for (i in seq(nrow(edges))) {
        edge <- paste0(
            rownames(gbin)[edges[i, 1]], "|", rownames(gbin)[edges[i, 2]]
        )
        gold@edgeData@data[[edge]]$weight <- g[edges[i, 1], edges[i, 2]]
    }
    return(gold)
}


#' @importFrom edgeR DGEList calcNormFactors estimateDisp
#' @noRd
estimateTheta <- function(data, ...) {
    if (is.null(dim(data))) {
        data <- matrix(data, length(data))
    }
    if (ncol(data) < 5) {
        mus <- apply(data, 2, mean)
        sigmas <- apply(data, 2, sd)
        thetas <- mus^2 / (sigmas^2 - mus)
        theta <- mean(thetas)
    } else {
        y <- edgeR::DGEList(counts = t(data))
        y <- edgeR::calcNormFactors(y)
        y <- edgeR::estimateDisp(y, ...)
        theta <- 1 / y$common.dispersion
    }
    return(theta)
}


#' @noRd
make.log.link <- function(base=exp(1)) {
    structure(list(
        linkfun = function(mu) log(mu, base),
        linkinv = function(eta) base**eta,
        mu.eta = function(eta) base**eta,
        valideta = function(eta) TRUE
    ), class = "link-glm")
}


#' Compute the true casual effects of a simulated dag
#'
#' This function takes a DAG with edgeweights as input and computes
#' the causal effects of all nodes on all direct and indirect children in the
#' DAG. Alternatively see pcalg::causalEffect for pairwise computation.
#' @param g graphNEL object
#' @param partial if FALSE computes the total causal effects and not just
#' the partial edge effects
#' @author Martin Pirkl
#' @return matrix of causal effects
#' @export
#' @importFrom pcalg causalEffect
#' @import graph tidyverse
#' @importFrom expm %^%
#' @importFrom naturalsort naturalorder
#' @examples
#' graph.wt <- as(matrix(c(0,0,0,1,0,0,0,1,0), 3), "graphNEL")
#' trueEffects(graph.wt)
trueEffects <- function(g, partial = FALSE) {
    a <- as(g, "matrix")
    a <- a[naturalorder(rownames(a)), naturalorder(colnames(a))]
    if (partial) {
        ae <- a
    } else {
        ae <- a
        for (i in 2:nrow(a)) {
            ae <- ae + a %^% i
        }
    }
    return(ae)
}

#' Estimate number of latent confounders
#' Compute the true casual effects of a simulated dag
#'
#' This function takes a DAG with edgeweights as input and computes
#' the causal effects of all nodes on all direct and indirect children in the
#' DAG. Alternatively see pcalg::causalEffect for pairwise computation.
#' @param X1 data matrix corresponding to the first condition
#' @param X2 data matrix corresponding to the second condition
#' @param method a string indicating the method used for estimating the number
#' of latent variables
#' @author Domagoj Ćevid
#' @return estimated number of latent variables
#' @export
#' @importFrom stats prcomp
#' @examples
#' graph1 <- create_random_DAG(node_num = 100, prob = .1)
#' graph2 <- resample_edge_weights(graph1, tp=0.15)
#' X1 <- simulate_data(graph1, n=200, latent = 3)
#' X2 <- simulate_data(graph2, n=200, latent = 3)
#' estimate_latent_count(X1, X2)
estimate_latent_count <- function(X1, X2, method = "auto") {
    if (method == "auto") {
        # this function estimates the number of principal components by
        # comparing the singular values to what they would be if the
        # variables were independent which is estimated by permuting the
        # columns of the data matrix
        permutation_thresholding <- function(X, quantile = 0.99) {
            N <- 50
            X <- scale(X)
            X <- X[, !is.na(apply(X, 2, sum))]
            X <- X[, sort(apply(X, 2, sd),
                          index.return = TRUE,
                          decreasing = TRUE)$ix[seq_len(min(1000, ncol(X)))]]
            r <- min(dim(X))

            evals <- matrix(0, nrow = N, ncol = r)
            for (i in seq_len(N)) {
                X_perm <- apply(X, 2, function(xx) sample(xx))
                evals[i, ] <- svd(X_perm, nu = 0, nv = 0)$d[seq_len(r)]
            }
            thresholds <- apply(
                evals, 2,
                function(xx) quantile(xx, probs = quantile)
            )

            # limit number of confounders to at most 10% of
            # the number of data points or variables
            limit <- min(ceiling(0.1 * r), 15)
            # last which crosses the threshold
            crosses <- (svd(X, nu = 0, nv = 0)$d > thresholds)[1:limit]
            return(max(which(c(TRUE, crosses)) - 1))
        }

        q1 <- permutation_thresholding(X1)
        q2 <- permutation_thresholding(X2)

        return(ceiling((q1 + q2) / 2))
    }

    if (method == "kim") {
        # this approach looks at the knee point of a scree plot
        X <- rbind(X1, X2)

        # fix "cannot rescale a constant/zero column to unit variance" in PCA
        X <- Filter(function(x) min(x) != max(x), as.data.frame(X))

        fit_pca <- prcomp(scale(X))

        scree <- fit_pca$sdev
        scree <- scree[seq_len(round(length(scree) / 2))]
        values <- seq(length(scree))

        d1 <- diff(scree) / diff(values) # first derivative
        d2 <- diff(d1) / diff(values[-1]) # second derivative
        idx <- which.max(abs(d2))

        return(idx)
    }

    if (method == "cluster") {
        X <- rbind(X1, X2)
        X <- Filter(function(x) min(x) != max(x), as.data.frame(X))
        fit_pca <- prcomp(scale(X))

        scree <- fit_pca$sdev
        clust <- kmeans(
            scree,
            centers = c(
                scree[1],
                scree[2],
                scree[round(length(scree) / 2 + 1)],
                scree[length(scree)]
            )
        )$cluster
        idx <- sum(clust == clust[1])
        return(idx)
    }
}
#' costum rlm function
#' @param ... see ?MASS::rlm
#' @importFrom MASS rlm
rlm_dce <- function(...) {
    x <- MASS::rlm(...)
    class(x) <- "rlm_dce"
    return(x)
}
#' summary for rlm_dce
#' @param object object of class 'rlm_dce'
#' @param ... see ?MASS::summary.rlm
#' @method summary rlm_dce
summary.rlm_dce <- function(object, ...) {
    # TODO: fix this...
    stop("MASS:::summary.rlm is unexported...")
}
cbg-ethz/dce documentation built on Oct. 29, 2022, 8:14 a.m.