########################################################################################################### Internal functions ##########
# Printing messages
.timestamp <- function() format(Sys.time(), " %H:%M:%S")
# Temporary directory
.createTempFile <- function() {
    # Unique working directory derived from a temporary file name.
    uniqueName <- basename(tempfile(pattern = "DECO_temp"))
    temp.path <- file.path(getwd(), uniqueName)
    dir.create(temp.path)
    return(temp.path)
}
################################################ localMinima and localMaxima
localMinima <- function(x) {
    # TRUE marks an increasing step. The large sentinel makes the first step
    # decreasing, so the odd-numbered runs end at local minima.
    y <- diff(c(.Machine$integer.max, x)) > 0L
    y <- cumsum(rle(y)$lengths)
    y <- y[seq.int(1L, length(y), 2L)]
    if (x[[1]] == x[[2]]) {
        y <- y[-1]
    }
    return(y)
}
localMaxima <- function(x) {
    # Mirror of localMinima: the negative sentinel makes the first step
    # increasing, so the odd-numbered runs end at local maxima.
    y <- diff(c(-.Machine$integer.max, x)) > 0L
    y <- cumsum(rle(y)$lengths)
    y <- y[seq.int(1L, length(y), 2L)]
    if (x[[1]] == x[[2]]) {
        y <- y[-1]
    }
    return(y)
}
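# Example (illustrative):
#   x <- c(1, 2, 3, 2, 1, 2)
#   localMaxima(x)  # 3 6 -> peak at x[3], rising boundary at x[6]
#   localMinima(x)  # 1 5 -> falling boundary at x[1], valley at x[5]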
################################################ Calculate quantile threshold over feature standard deviations
quantSD <- function(sdx, p = NULL) {
if (is.null(p)) {
p <- 0.75
}
if (quantile(sdx, probs = p) <= median(range(sdx))) {
thr <- quantile(sdx, probs = p)
} else {
thr <- median(sdx)
}
return(thr)
}
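# Worked example: for sdx = seq(0.1, 1, by = 0.1), the 75% quantile (0.775)
# exceeds the midpoint of the range (0.55), so the threshold falls back to
# the median:
#   quantSD(seq(0.1, 1, by = 0.1))  # 0.55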
################################################ AnnotateDECO
AnnotateDECO <- function(ids, id.type, attributes = NA, pack.db = "Homo.sapiens",
rm.redundant = TRUE, verbose = FALSE) {
if (!exists(pack.db)) {
stop("ERROR: Annotation package '", pack.db, "' is not in library.")
} else {
requireNamespace(pack.db, quietly = TRUE)
}
if (verbose) {
message(.timestamp(), "-- Annotating IDs with Bioconductor...")
}
if (all(is.na(attributes))) {
attributes <- c("SYMBOL", "ENSEMBL", "ENTREZID", "TXCHROM",
"GENENAME")
}
if (id.type == "PROBEID") {
xx <- as.list(get(paste(unlist(strsplit(pack.db, split = ".db")),
"ENSEMBL", sep = "")))
infogenes <- suppressMessages(AnnotationDbi::select(x = get(pack.db),
obj = names(xx), columns = c(attributes, id.type), keys = keys(get(pack.db),
keytype = "ENSEMBL"), keytype = "ENSEMBL"))
} else {
        infogenes <- suppressMessages(AnnotationDbi::select(x = get(pack.db),
            columns = attributes, keys = keys(get(pack.db), keytype = id.type),
            keytype = id.type))
}
infogenes <- infogenes[apply(infogenes[, attributes], 1, function(x) all(!is.na(x))),
]
if (all(is.na(infogenes))) {
stop("Annotation library does not found input IDs.")
}
infogenes <- infogenes[order(infogenes[, id.type]), ]
infogenes <- infogenes[infogenes[, id.type] %in% ids, ]
if (rm.redundant) {
infogenes <- infogenes[!(duplicated(infogenes[, id.type])), ]
rownames(infogenes) <- infogenes[, id.type]
if (verbose) {
message(.timestamp(), "-- Removed duplicated info from IDs.")
}
}
return(infogenes)
}
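# Example (illustrative, not run; assumes the 'Homo.sapiens' annotation
# package is installed and attached):
#   library(Homo.sapiens)
#   AnnotateDECO(c("ENSG00000121410", "ENSG00000175899"),
#                id.type = "ENSEMBL", pack.db = "Homo.sapiens")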
############################## innerProductAssign
innerProductAssign <- function(inner, samples = NA, control = NA, analysis) {
samples <- samples[names(inner)]
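    # Samples are clustered around the modes of the inner-product density;
    # in the binary design, control and case samples are clustered separately.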
if (analysis == "Binary") {
res <- 0
inner0 <- inner
for (j in seq_len(length(table(samples)))) {
if (j == 1) {
inner <- inner0[names(samples[samples == control])]
} else {
inner <- inner0[names(samples[samples != control])]
}
error <- try(expr = {
d <- density.lf(x = as.matrix(inner))
}, silent = TRUE)
if (inherits(error, "try-error")) {
d <- density(x = inner)
}
x <- d$x[localMaxima(d$y)]
error <- try(expr = {
p <- kmeans(inner, centers = x)
}, silent = TRUE)
if (inherits(error, "try-error")) {
error <- try(expr = {
p <- kmeans(inner, centers = length(x) - 1, nstart = 10)
}, silent = TRUE)
}
res <- c(res, p$cluster + max(res))
}
res <- res[2:length(res)]
res <- res[order(res, inner0)]
} else {
error <- try(expr = {
d <- density.lf(x = as.matrix(inner))
}, silent = TRUE)
if (inherits(error, "try-error")) {
d <- density(x = inner)
}
x <- d$x[localMaxima(d$y)]
error <- try(expr = {
p <- kmeans(inner, centers = x)
}, silent = TRUE)
if (inherits(error, "try-error")) {
error <- try(expr = {
p <- kmeans(inner, centers = length(x), nstart = 10)
}, silent = TRUE)
}
res <- p$cluster[order(p$cluster, inner)]
}
return(list(cl = res, centroid = x))
}
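# Example (illustrative, not run): split a bimodal vector of inner-product
# values at its density modes; sample names here are made up.
#   inner <- setNames(c(rnorm(10, -2), rnorm(10, 2)), paste0("s", 1:20))
#   innerProductAssign(inner, analysis = "Unsupervised")$cl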
############################## Distance function with correlation
distFunc <- function(x, use = "pairwise.complete.obs", cor.method = "pearson") {
co.x <- cor(x, use = use, method = cor.method)
dist.co.x <- 1 - co.x
return(as.dist(dist.co.x))
}
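# Example (illustrative): perfectly correlated columns lie at distance 0,
# perfectly anti-correlated columns at distance 2.
#   m <- cbind(a = 1:5, b = 1:5, c = 5:1)
#   distFunc(m)  # d(a, b) = 0, d(a, c) = 2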
############################## Cophenetic correlation and clustering dendrogram
cophDECO <- function(data, method.heatmap = "ward.D", k = NULL, scale = FALSE,
verbose = TRUE, coph = TRUE) {
if (is.null(method.heatmap)) {
method.heatmap <- c("ward.D", "ward.D2", "single", "complete", "average",
"mcquitty", "median", "centroid")
}
if (scale) {
data <- scale(data)
}
d <- distFunc(x = as.matrix(data), cor.method = "pearson")
d[is.na(d)] <- 0
if (coph) {
hmet <- vapply(method.heatmap, function(x) {
sampleTree <- as.dendrogram(hclust(d, method = x))
coph <- cor(c(d), c(cophenetic(sampleTree)), method = "pearson")
}, FUN.VALUE = numeric(1))
} else {
hmet <- rep(1, length(method.heatmap))
names(hmet) <- method.heatmap
}
sampleTree <- hclust(d, method = names(hmet[order(hmet, decreasing = TRUE)])[1])
sampleTree$height <- sampleTree$height[order(sampleTree$height)]
if (is.null(k)) {
p <- c()
for (i in 2:(dim(data)[2] - 1)) {
cutt <- cutree(sampleTree, k = i)
p[i - 1] <- .cluster.stat(d = d, clustering = cutt)$pearsongamma
}
p <- unlist(na.omit(p))
cutt <- cutree(sampleTree, k = which(p == max(p)) + 1)
} else {
cutt <- cutree(sampleTree, k = k)
p <- .cluster.stat(d = d, clustering = cutt)$pearsongamma
        if (verbose) {
            message(.timestamp(), "-- NOTE: Number of 'k' subclasses has been fixed by the user.")
        }
}
return(list(dend = sampleTree, coph = hmet, cluster = cutt, huber = max(p)))
}
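# Example (illustrative, not run): choose the linkage with the highest
# cophenetic correlation and cut the sample tree automatically.
#   m <- matrix(rnorm(100), nrow = 10,
#               dimnames = list(paste0("g", 1:10), paste0("s", 1:10)))
#   cophDECO(m, method.heatmap = NULL, verbose = FALSE)$cluster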
############################## Subsampling probability
subsamplingProb <- function(x, iter, n1, n2 = 0) {
if (n2 > 0) {
p1o <- x/n1
p2o <- x/n2
m <- p1o * p2o
r1 <- choose(n1, x)
r2 <- choose(n2, x)
all <- r1 * r2
} else {
p1o <- x/n1
r1 <- choose(n1, x)
m <- 0
all <- r1
}
return(list(prob.s = m, prob.comb = iter/all, n.all = all))
}
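# Worked example: a resampling size of x = 3 over classes of sizes n1 = 10
# and n2 = 8 allows choose(10, 3) * choose(8, 3) = 120 * 56 = 6720
# combinations, so 1000 iterations cover about 15% of them:
#   subsamplingProb(x = 3, iter = 1000, n1 = 10, n2 = 8)$prob.comb  # ~0.149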
############################## Overlapping omic signal
overlapFeature <- function(id, data, classes, control, analysis, infoS = NA,
plot = FALSE) {
if (analysis == "Binary") {
da <- density.lf(as.matrix(data[id, names(classes[classes == control])]),
from = range(data)[1], to = range(data)[2], n = 1000, width = 1)
db <- density.lf(as.matrix(data[id, names(classes[classes != control])]),
from = range(data)[1], to = range(data)[2], n = 1000, width = 1)
d <- data.frame(x = da$x, a = da$y, b = db$y)
# calculate intersection densities
d$w <- pmin(d$a, d$b)
# integrate areas under curves
total <- integrate.xy(d$x, d$a) + integrate.xy(d$x, d$b)
intersection <- integrate.xy(d$x, d$w)
col <- adjustcolor(c("navyblue", "darkred"), alpha.f = 0.3)
pos <- 2:dim(d)[2]
txt <- c("Control", "Case")
}
if (analysis == "Multiclass") {
d <- vapply(levels(classes), function(x) {
error <- try(expr = {
d <- density.lf(as.matrix(data[id, names(classes[classes ==
x])]), from = range(data)[1], to = range(data)[2], n = 1000,
width = 1)$y
}, silent = TRUE)
if (inherits(error, "try-error")) {
d <- density(as.numeric(data[id, names(classes[classes ==
x])]), n = 1000)$y
}
return(d)
}, FUN.VALUE = numeric(1000))
error <- try(expr = {
x <- density.lf(as.matrix(data[id, names(classes[classes == levels(classes)[1]])]),
from = range(data)[1], to = range(data)[2], n = 1000, width = 1)$x
}, silent = TRUE)
if (inherits(error, "try-error")) {
x <- density(as.numeric(data[id, names(classes[classes == levels(classes)[1]])]),
n = 1000)$x
}
w <- apply(apply(expand.grid(levels(classes), levels(classes)), 1,
function(y) pmin(d[, y[1]], d[, y[2]])), 1, max)
d <- data.frame(d, x = x, w = w)
total <- sum(vapply(seq_len(length(levels(classes))), function(y) integrate.xy(d$x,
d[, y]), FUN.VALUE = numeric(1)))^2
intersection <- integrate.xy(d$x, d$w)
pos <- seq_len(length(levels(classes)))
col <- adjustcolor(colorRampPalette(c("navyblue", "darkorange", "darkred",
"darkgreen"))(length(levels(classes))), alpha.f = 0.3)
txt <- levels(classes)
}
# compute overlap coefficient
overlap <- length(levels(classes)) * intersection/total
if (plot) {
par(mar = c(10, 6, 10, 8))
stackpoly.2(x = d[, pos], col = col, border = "grey", axis1 = TRUE,
stack = FALSE, lwd = 1, lty = 3, xaxlab = round(d[seq(1, dim(d)[1],
dim(d)[1] * 0.05), "x"], 1), xat = seq(1, dim(d)[1], dim(d)[1] *
0.05), xlab = "Raw Data Signal", ylab = "Density", main = paste("Overlap:",
round(overlap, 3), "\nLikelihood density estimation, bandwidth = 1"))
legend("topright", legend = txt, pch = 15, cex = 1.5, pt.cex = 1.5,
col = col, bty = "n")
}
return(overlap)
}
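# Example (illustrative, not run; needs 'locfit' and 'sfsmisc'): an overlap
# close to 1 means the class densities of the feature largely coincide.
#   data <- matrix(rnorm(200), nrow = 2,
#                  dimnames = list(c("g1", "g2"), paste0("s", 1:100)))
#   classes <- setNames(factor(rep(c("Ctrl", "Case"), each = 50)),
#                       colnames(data))
#   overlapFeature("g1", data, classes, control = "Ctrl", analysis = "Binary")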
############################## Type of analysis
##############################
.analysisType <- function(deco){
## Type of analysis run
if(length(NSCAcluster(deco)) == 1 & all(is.na(deco@classes)))
analysis <- "Unsupervised"
if(length(NSCAcluster(deco)) == 1 & any(!is.na(deco@classes)))
analysis <- "Multiclass"
if(length(NSCAcluster(deco)) > 1)
analysis <- "Binary"
return(analysis)
}
############################## Color assignment
##############################
jColor <- function(info) {
info <- as.matrix(info)
nam <- rownames(info)
double <- c("black", "gold", "ivory3", "cornflowerblue", "chocolate",
"honeydew1")
colnames(info) <- paste(rev(LETTERS[seq_len(dim(info)[2])]), ": ", colnames(info),
sep = "")
for (j in seq_len(dim(info)[2])) info[, j] <- paste(rev(LETTERS[seq_len(dim(info)[2])])[j],
": ", as.character(info[, j]), sep = "")
rownames(info) <- nam
info.sample.color <- info
info.sample.color <- apply(info.sample.color, 2, as.character)
if (dim(info)[2] > 1) {
pos <- apply(info, 2, unique)
if (is.matrix(pos)) {
pos <- rep(dim(pos)[1] == 2, dim(pos)[2])
} else {
pos <- lapply(pos, length) == 2
}
} else {
pos <- dim(apply(info, 2, unique))[1] <= 2
}
ty <- sort(unlist(apply(info[, !pos, drop = FALSE], 2, unique)))
tyy <- sort(unlist(apply(info[, pos, drop = FALSE], 2, unique)))
myPalette <- c(RColorBrewer::brewer.pal(name = "Set1", n = 9)[seq_len(6)], "white",
RColorBrewer::brewer.pal(name = "Set1", n = 9)[8:9], "black")
if (length(ty) > 10) {
myPalette <- colorRampPalette(myPalette)(length(unlist(apply(info,
2, unique))))
}
if (length(which(pos)) < dim(info)[2]) {
for (z in seq_len(length(ty))) {
info.sample.color[info %in% ty[z]] <- as.character(rep(myPalette[z],
length(info.sample.color[info == unlist(apply(info, 2, unique))[z]])))
names(ty)[z] <- unique(as.character(rep(myPalette[z], length(info.sample.color[info ==
unlist(apply(info, 2, unique))[z]]))))
}
}
rownames(info.sample.color) <- nam
ty <- c(tyy, ty)
if (any(pos)) {
for (j in seq_len((length(which(pos)) * 2))) info.sample.color[info ==
ty[j]] <- rep(double, 10)[j]
names(ty)[seq_len(j)] <- rep(double, 10)[seq_len(j)]
}
return(list(orig = info, col = info.sample.color, ty = ty))
}
################################# Combinatorial calculation considering multiclass experimental design
.combCalcM <- function(data, classes, control, iterations, multi, r, results) {
    if (!(all(names(classes) %in% colnames(data))) || is.null(names(classes))) {
        stop("ERROR: Sample names in the 'classes' vector do not match the data column names.")
    }
classes <- sort(as.factor(classes))
# Calculating maximal resampling size per class
if (is.null(r) || r <= 0) {
r <- round(sqrt(min(table(classes))), digits = 0)
}
if(r < 3)
r <- 3
# If multiclass vector has been proposed:
if (length(levels(classes)) > 2) {
control <- NA
combMulti <- t(combn(levels(classes), 2))
# Removing auto-combination
combMulti <- combMulti[combMulti[, 1] != combMulti[, 2], ]
        maxSub <- subsamplingProb(x = r, n1 = min(table(classes)), iter = 0)$n.all
        # Validate the resampling size before it is used.
        if (is.na(maxSub)) {
            stop("ERROR: Resampling size is higher than minimum class size.")
        }
        if (iterations == 0) {
            iterations <- maxSub
        }
        multIter <- round(iterations/dim(combMulti)[1], 0)
        if (multIter > maxSub) {
            multIter <- maxSub
        }
message(" NOTE: More than two classes of samples.\n
Multiclass analysis must run ",
dim(combMulti)[1], " (or multiple) iterations at least:\n ", multIter *
dim(combMulti)[1], " random iterations (", multIter, " rounds) will be calculated.\n")
c1 <- apply(combMulti, 1, function(y) names(classes[classes == y[1]]))
c2 <- apply(combMulti, 1, function(y) names(classes[classes == y[2]]))
combi <- matrix(data = 0, ncol = 2 * r, nrow = 1)
# Calculating combinations of samples without mixing classes.
for (i in seq_len(multIter)) {
if (is.list(c1)) {
combi1 <- matrix(unlist(lapply(c1, function(x) which(colnames(data) %in%
sample(x, size = r, replace = FALSE)))), ncol = r, byrow = TRUE)
combi2 <- matrix(unlist(lapply(c2, function(x) which(colnames(data) %in%
sample(x, size = r, replace = FALSE)))), ncol = r, byrow = TRUE)
} else {
combi1 <- t(apply(c1, 2, function(x) which(colnames(data) %in%
sample(x, size = r, replace = FALSE))))
combi2 <- t(apply(c2, 2, function(x) which(colnames(data) %in%
sample(x, size = r, replace = FALSE))))
}
combi <- rbind(combi, cbind(combi1, combi2))
}
results <- cbind(combi[2:dim(combi)[1], ],
nFeatures = c(rep(0, multIter * dim(combMulti)[1])))
colnames(results) <- c(seq_len(2 * r), "nFeatures")
n1 <- dim(data)[2]
n2 <- min(table(classes))
cl1 <- NA
cl2 <- NA
categories.control <- seq_len(dim(data)[2])
categories.case <- seq_len(dim(data)[2])
multi <- TRUE
} else {
# Binary contrast. 'Control' label defines '0' class for eBayes algorithm.
if (is.na(control)) {
cl1 <- levels(classes)[1]
cl2 <- levels(classes)[2]
control <- cl1
} else {
if (!(control %in% levels(classes))) {
stop("Control label not found in vector classes provided.")
}
cl1 <- control
cl2 <- levels(classes)[which(levels(classes) != control)]
}
n1 <- length(which(classes == cl1))
n2 <- length(which(classes == cl2))
classes <- classes[c(which(classes == cl1), which(classes == cl2))]
# Maximum number of iterations
maxSub <- prod(factorial(c(n1, n2))/(factorial(r) * factorial(c(n1,
n2) - r)))
        if (is.nan(maxSub)) {
            maxSub <- 1e9
            message(.timestamp(), " -- Maximum number of iterations is set to ",
                maxSub, ".")
        }
        if (iterations > maxSub) {
            iterations <- maxSub
            message(.timestamp(), " -- 'iterations' exceeds the maximum number of sample combinations and will be changed to ",
                maxSub, ".")
        }
data <- data[, names(classes)]
categories.control <- seq_len(n1)
categories.case <- (n1 + 1):(n1 + n2)
message("\n Classes vector defined as input.
SUPERVISED analysis will be carry out.\n")
}
return(list(categories.control = categories.control, data = data, categories.case = categories.case,
results = results, multi = multi, classes = classes, cl1 = cl1, cl2 = cl2, iterations = iterations,
n1 = n1, n2 = n2))
}
################################# Combinatorial calculation
.combCalc <- function(iterations, multi, classes, r, results, categories.control,
categories.case, n1, n2) {
cont <- 0
user <- "n"
# Asking for number of iterations if it was not defined.
    if ((is.null(iterations) || iterations == 0) && !multi) {
n.all <- subsamplingProb(x = r, iter = 1000, n1 = n1, n2 = n2)$n.all
        message("\n All possible combinations: ", n.all)
        user <- readline(prompt = " Run ALL combinations? (y/n): ")
        if (user == "n") {
            iterations <- as.integer(readline(prompt = " Define number of random combinations to calculate: "))
            # as.integer() returns NA on invalid input, so test with is.na().
            if (is.na(iterations)) {
                stop("Non-integer argument passed as number of combinations.")
            }
        }
}
# Calculate random or all possible combinations between samples if
# 'binary', 'multiclass' or 'unsupervised' has been proposed.
if (user == "y" & !multi) {
allCombControl <- combn(n1, r)
allCombCase <- combn(n2, r)
nSamples <- nrow(allCombControl) + nrow(allCombCase)
ncomb <- ncol(allCombControl) * ncol(allCombCase)
results <- matrix(nrow = ncomb, ncol = nSamples)
colnames(results) <- seq_len(nSamples)
cont <- 1
# Building combination matrix.
for (iCombCONTROL in seq_len(ncol(allCombControl))) {
for (iCombCASE in seq_len(ncol(allCombCase))) {
samples <- c(categories.control[allCombControl[, iCombCONTROL]],
categories.case[allCombCase[, iCombCASE]])
                if (any(duplicated(samples))) {
                    # Replace each duplicate with a distinct unused sample.
                    samples[duplicated(samples)] <- sample(which(!(seq_len(n1 +
                        n2) %in% samples)), sum(duplicated(samples)))
                }
results[cont, seq_len(nSamples)] <- samples
cont <- cont + 1
}
}
results <- cbind(results, nFeatures = c(rep(0, ncomb)))
msg <- paste(.timestamp(), "-- Number of total iterations:", ncomb)
} else if (!multi) {
results <- matrix(nrow = iterations, ncol = 2 * r + 1)
colnames(results) <- c(seq(from = 1, to = 2 * r), "nFeatures")
# Building combination matrix.
for (i in seq_len(iterations)) {
if (all(!(is.na(classes)))) {
results[i, seq_len(r)] <- sample(categories.control, r, replace = FALSE)
results[i, (r + 1):(2 * r)] <- sample(categories.case, r,
replace = FALSE)
} else {
results[i, seq_len(r * 2)] <- sample(categories.control, 2 *
r, replace = FALSE)
}
}
results[, c("nFeatures")] <- 0
msg <- paste(.timestamp(), "-- Randomly selected", iterations, "iterations.")
} else {
msg <- paste(.timestamp(), "-- Number of total iterations:", dim(results)[1])
}
message(msg)
return(results)
}
################################# Internal LIMMA calculations
.LIMMAcalc <- function(data, results, classes, control,
q.val, call, r, temp.path,
bpparam, multi, n1, n2) {
# Creating final variable containing all subsampling results.
res <- list(data = as.matrix(data), results = results, subStatFeature = NULL,
incidenceMatrix = NULL, classes = classes, resampleSize = r, control = control,
pos.iter = 0, q.val = q.val, call = call)
    # Define some variables for the parallel subsampling step.
top_eje <- matrix(ncol = 7)
colnames(top_eje) <- c("logFC", "AveExpr", "t", "P.Value", "adj.P.Val",
"B", "ID")
top_eje[, 2:7] <- 0
UP <- as.data.frame(data)
if (all(is.na(classes)) | multi) {
UP[, seq_len(n1)] <- 0
} else {
UP[, seq_len(n1 + n2)] <- 0
}
DOWN <- UP
mx <- UP
    results <- cbind(results, counter = seq_len(dim(results)[1]))
suppressWarnings(limma1 <- bplapply(seq_len(dim(results)[1]), FUN = .limmaSubsamp,
BPPARAM = bpparam, results = results[, seq_len(2 * r)], r = r, data = data,
q.val = q.val, temp.path = temp.path))
pos <- unlist(lapply(limma1, function(x) x$pos))
limma1 <- unlist(lapply(limma1, function(x) x$p))
limma1 <- limma1[!(is.na(limma1))]
results[, "nFeatures"] <- pos
results <- results[, seq_len(dim(results)[2] - 1)]
# Stop subsampling for non-differential signal.
if (length(limma1) <= 1) {
stop("NO-RESULT: Any combination shows DE features with these
resampling size and p-value threshold.
Try again with another parameters.")
}
return(list(res = res, limma1 = limma1, top_eje = top_eje, mx = mx, UP = UP,
DOWN = DOWN, results = results))
}
################################# Per-feature statistics across resampling iterations
.statCalc <- function(counter, tab, top_eje, ncomb, j) {
res_ <- vector(length = 12)
names(res_) <- colnames(tab)
lines <- top_eje[as.character(top_eje[, c("ID")]) %in% as.character(tab[counter,
c("ID")]), ]
# Statistics from this RDA step.
if (length(lines[, c("ID")]) > 0) {
res_[c("Avrg.logFC")] <- .colMeans(lines[, c("logFC")], m = length(lines[,
c("ID")]), n = 1)
res_[c("Best.adj.P.Val")] <- lines[1, c("adj.P.Val")]
res_[c("Repeats")] <- length(lines[, 1])
res_[c("FR.Repeats")] <- res_[c("Repeats")]/ncomb
res_[c("RF.Positive.Repeats")] <- res_[c("Repeats")]/j
res_[c("Chi.Square")] <- (-2) * sum(log(lines[, c("P.Value")]))
res_[c("P.Val.ChiSq")] <- 1 - pchisq(as.numeric(res_[c("Chi.Square")]),
df = 2 * length(lines[, 1]))
res_[c("ID")] <- as.character(tab[counter, c("ID")])
}
return(res_)
}
################################# LIMMA run on a single sample combination
.limmaSubsamp <- function(counter, results, r, data, q.val,
temp.path) {
# Taking different combinations from previous matrix and running LIMMA.
CASE <- CONTROL <- NULL
samples <- results[counter, seq_len(r * 2)]
design <- cbind(
CONTROL = rep(c(1, 0), each = r),
CASE = rep(c(0, 1), each = r)
)
fit <- limma::lmFit(data[, samples], design)
cont.mat <- limma::makeContrasts(CASEvsCONTROL = CASE - CONTROL,
levels = design)
fit2 <- limma::contrasts.fit(fit, cont.mat)
fit3 <- limma::eBayes(fit2)
top <- limma::topTable(fit3, adjust.method = "fdr",
number = nrow(data),
p.value = q.val)
pos <- dim(top)[1]
    # Generating intermediate files in the temporary dir. If no DE feature
    # is found for a combination, no files are created.
resultfile <- paste(temp.path, "/", c("diff", "incid"), counter, ".dta",
sep = "")
if (pos > 0) {
top <- data.frame(top, ID = rownames(top))
top <- top[seq_len(pos), ]
# Differential features and incidence matrix files are separated.
foreign::write.dta(top, file = resultfile[1])
foreign::write.dta(as.data.frame(samples), file = resultfile[2])
} else {
resultfile <- c(NA, NA)
}
return(list(p = resultfile, pos = pos))
}
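# Illustration: with r = 3 the design matrix built above is
#     CONTROL CASE
#           1    0   (rows 1-3, the control resample)
#           0    1   (rows 4-6, the case resample)
# and eBayes tests the single contrast CASE - CONTROL.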
################################# Frequency matrix and intermediate results
.freqMatrix <- function(limmaRes, data, n1, n2, r, multi, unsup) {
mx0 <- limmaRes$mx
mx20 <- limmaRes$mx
pb <- txtProgressBar(style = 3, min = 0, max = length(limmaRes$limma1) -
1)
UP <- limmaRes$UP
DOWN <- limmaRes$DOWN
top_eje <- limmaRes$top_eje
# Reading and summarizing intermediate results.
for (i in seq(from = 1, to = length(limmaRes$limma1) - 1, by = 2)) {
mx <- mx0
mx2 <- mx20
resultfile1 <- limmaRes$limma1[i]
resultfile2 <- limmaRes$limma1[i + 1]
if (is.matrix(limmaRes$limma1)) {
resultfile1 <- limmaRes$limma1[1, i]
resultfile2 <- limmaRes$limma1[2, i]
}
# Joining all differential event statistics.
top_eje1 <- foreign::read.dta(file = resultfile1)
colnames(top_eje1) <- colnames(top_eje)
rownames(top_eje1) <- make.names(rep(LETTERS, dim(data)[1])[seq_len(dim(top_eje1)[1])],
unique = TRUE)
samples <- as.vector(t(foreign::read.dta(file = resultfile2)))
counter <- as.numeric(unlist(strsplit(unlist(strsplit(resultfile1,
split = "diff"))[2], split = ".", fixed = TRUE))[1])
# Generating global incidence matrix with UP and DOWN events.
mx[rownames(mx) %in% top_eje1[top_eje1[, c("logFC")] < 0, c("ID")],
samples[seq_len(r)]] <- mx[rownames(mx) %in% top_eje1[top_eje1[,
c("logFC")] < 0, c("ID")], samples[seq_len(r)]] + 1
mx[rownames(mx) %in% top_eje1[top_eje1[, c("logFC")] > 0, c("ID")],
samples[(r + 1):(r * 2)]] <- mx[rownames(mx) %in% top_eje1[top_eje1[,
c("logFC")] > 0, c("ID")], samples[(r + 1):(r * 2)]] + 1
mx2[rownames(mx2) %in% top_eje1[top_eje1[, c("logFC")] < 0, c("ID")],
samples[(r + 1):(r * 2)]] <- mx2[rownames(mx2) %in% top_eje1[top_eje1[,
c("logFC")] < 0, c("ID")], samples[(r + 1):(r * 2)]] + 1
mx2[rownames(mx2) %in% top_eje1[top_eje1[, c("logFC")] > 0, c("ID")],
samples[seq_len(r)]] <- mx2[rownames(mx2) %in% top_eje1[top_eje1[,
c("logFC")] > 0, c("ID")], samples[seq_len(r)]] + 1
UP <- UP + mx
DOWN <- DOWN + mx2
top_eje <- rbind(top_eje, top_eje1)
rownames(top_eje1) <- NULL
setTxtProgressBar(pb, i)
}
close(pb)
UP <- UP[rowSums(UP) > 0, ]
DOWN <- DOWN[rowSums(DOWN) > 0, ]
MULTI <- UP + DOWN
    # Building the incidence matrix with separate UP, DOWN and MIXED events.
if (!unsup & !multi) {
both.feat <- names(which(rowSums(UP[, seq_len(n1)]) > 0 & rowSums(UP[,
(n1 + 1):(n1 + n2)]) > 0))
if (length(both.feat) > 0) {
both.feat.mx <- UP[rowSums(UP) > 0, ][rownames(UP) %in% both.feat,
]
rownames(both.feat.mx) <- paste(rownames(both.feat.mx), sep = "deco",
"UP")
UP <- UP[!(rownames(UP) %in% both.feat), ]
DOWN <- DOWN[!(rownames(DOWN) %in% both.feat), ]
} else {
both.feat.mx <- matrix(0, nrow = 1, ncol = ncol(UP))
}
} else {
both.feat <- NA
both.feat.mx <- NA
}
# Vector containing direction of differential event if 'binary'
UpDw <- c(rep("UP", length(rownames(UP)[rowSums(UP[, seq_len(n1)]) ==
0])), rep("DOWN", length(rownames(UP)[rowSums(UP[, seq_len(n1)]) >
0])), rep("MIXED", length(both.feat)))
names(UpDw) <- c(rownames(UP)[rowSums(UP[, seq_len(n1)]) == 0], rownames(UP)[rowSums(UP[,
seq_len(n1)]) > 0], both.feat)
rownames(UP) <- paste(rownames(UP), sep = "deco", "UP")
rownames(MULTI) <- paste(rownames(MULTI), sep = "deco", "UP")
rownames(DOWN) <- paste(rownames(DOWN), sep = "deco", "DOWN")
return(list(top_eje = top_eje, UP = UP, DOWN = DOWN, MULTI = MULTI, UpDw = UpDw,
both.feat = both.feat, both.feat.mx = both.feat.mx))
}
################################# Function for filtering features by Repeats
.repThr <- function(sub, rep.thr, samp.perc) {
g.names <- vapply(rownames(sub$incidenceMatrix), function(x) unlist(strsplit(x,
split = "deco", fixed = TRUE))[1], FUN.VALUE = character(1))
if (all(is.na(sub$classes)) || length(levels(sub$classes)) > 2) {
f <- apply(sub$incidenceMatrix, 1, function(x) length(which(x >= rep.thr)))
names(f) <- g.names[names(g.names) %in% names(f)]
} else {
f <- vapply(unique(g.names), function(x) sum(apply(sub$incidenceMatrix[grepl(rownames(sub$incidenceMatrix),
pattern = x, fixed = TRUE), ], 1, function(x) length(which(x >=
rep.thr)))), numeric(1))
}
# 'Repeats' threshold. All features under those thresholds will be
# removed.
if (length(which(f > samp.perc * dim(sub$incidenceMatrix)[2])) > 0) {
message(paste(" NOTE: Repeats index threshold have been applied:\n",
length(which(f <= samp.perc * dim(sub$incidenceMatrix)[2])), "features removed from",
dim(sub$subStatFeature)[1], "total."))
}
sub$subStatFeature <- sub$subStatFeature[names(f[which(f > samp.perc *
dim(sub$incidenceMatrix)[2])]), ]
sub$subStatFeature <- cbind(sub$subStatFeature, Repeats.index = f[names(f) %in%
rownames(sub$subStatFeature)]/dim(sub$incidenceMatrix)[2] * 100)
sub$subStatFeature <- sub$subStatFeature[order(rownames(sub$subStatFeature),
decreasing = TRUE), ]
# Removing features from incidenceMatrix with lower 'Repeats' than
# threshold.
sub$incidenceMatrix <- sub$incidenceMatrix[rownames(sub$incidenceMatrix) %in%
names(g.names[g.names %in% as.character(sub$subStatFeature$ID)]),
]
if (length(which(f > samp.perc * dim(sub$incidenceMatrix)[2])) < 10) {
stop("After applying 'Repeats' filter, there are not enough features
(10 features minimum) to input NSCA.")
}
return(list(sub = sub, g.names = g.names))
}
################################# Function to call overlapFeature
overlapC <- function(sub, cl1, bpparam) {
message(.timestamp(), " -- Calculating overlapping signal per feature...")
## Calculating overlap...
suppressWarnings(overlap <- bplapply(X = rownames(sub$subStatFeature),
FUN = overlapFeature, BPPARAM = bpparam, data = sub$data, classes = sub$classes,
control = cl1, analysis = "Binary"))
overlap <- unlist(overlap)
names(overlap) <- rownames(sub$subStatFeature)
## Type of profile...
prof <- vapply(overlap, function(x) max(which(c(0, 0.2, 0.4, 0.75) <=
x)), FUN.VALUE = numeric(1))
profile <- prof
profile[sub$subStatFeature[names(profile), c("UpDw")] == "MIXED" | prof == 4] <- "Mixed"
profile[prof == 3] <- "Minority"
profile[prof == 2] <- "Majority"
profile[prof == 1] <- "Complete"
names(profile) <- names(overlap)
profile <- profile[order(names(profile))]
return(list(overlap = overlap, profile = profile))
}
################################# Internal R function of decoReport
.formattingObj <- function(deco, deco0, analysis, sub, info.size) {
## Formatting R objects to print.
if (analysis == "Binary") {
# p.value from NSCA analysis.
p.val <- c(deco@NSCAcluster$Control$NSCA$P.Value, deco@NSCAcluster$Case$NSCA$P.Value)
# General information about subclasses found.
infoSubclass <- rbind(deco@NSCAcluster$Control$infoSubclass, deco@NSCAcluster$Case$infoSubclass)
rel <- c(rep(p.val[1] <= 0.05, dim(deco@NSCAcluster$Control$infoSubclass)[1]),
rep(p.val[2] <= 0.05, dim(deco@NSCAcluster$Case$infoSubclass)[1]))
infoSubclass <- data.frame(infoSubclass, Binary = rep(0, dim(infoSubclass)[1]),
isRelevant = as.character(rel))
infoSubclass[vapply(rownames(infoSubclass), function(x) unlist(strsplit(x,
split = " Subclass"))[1], character(1)) != deco@control, "Binary"] <- 1
# Sample membership to subclasses.
samplesSubclass <- rbind(cbind(deco@NSCAcluster$Control$samplesSubclass[order(deco@NSCAcluster$Control$samplesSubclass[,
1]), ]), cbind(deco@NSCAcluster$Case$samplesSubclass[order(deco@NSCAcluster$Case$samplesSubclass[,
1]), ]))
samplesSubclass <- cbind(Samples = rownames(samplesSubclass), Subclass = samplesSubclass[,
1])
rownames(samplesSubclass) <- seq_len(length(rownames(samplesSubclass)))
# NSCA and hierarchical clustering information.
nsc <- matrix(round(c(deco@NSCAcluster$Control$var, deco@NSCAcluster$Case$var,
p.val, deco@NSCAcluster$Control$hclustSamp$huber, deco@NSCAcluster$Case$hclustSamp$huber),
digits = 3), nrow = 3, byrow = TRUE)
colnames(nsc) <- c("Control samples", "Case samples")
rownames(nsc) <- c("Variability explained by NSCA", "NSCA C-statistic p.value",
"Huber's gamma")
        # Assignment of colors to all subclasses; kept consistent throughout
        # the report.
count <- table(vapply(rownames(infoSubclass), function(x) unlist(strsplit(x,
split = " Subclass"))[1], character(1)) != deco@control)
color.cluster <- c(colorRampPalette(c("navyblue", "lightblue"))(count[1]),
colorRampPalette(c("orange", "darkred"))(count[2]))
names(color.cluster) <- rownames(infoSubclass)
# Columns of 'featureTable' to be printed.
names.col <- c("ID", "SYMBOL", "UpDw", "Profile", "overlap.Ctrl.Case",
"Standard.Chi.Square", "Repeats", "Repeats.index", "delta.signal",
"sd.Ctrl", "Dendrogram.group.Ctrl", "h.Range.Ctrl", "sd.Case",
"Dendrogram.group.Case", "h.Range.Case")
deco@featureTable[, names.col[!names.col %in% c("ID", "SYMBOL", "UpDw",
"Profile")]] <- apply(deco@featureTable[, names.col[!names.col %in%
c("ID", "SYMBOL", "UpDw", "Profile")]], 2, as.numeric)
textF <- deco@featureTable[, colnames(deco@featureTable) %in% names.col]
deco0@featureTable <- deco@featureTable
textF <- textF[, na.omit(match(names.col, colnames(textF)))]
        # Chi, Delta, Repeats & SD (inverse)
ordG <- order(rowMedians(apply(cbind(textF$Standard.Chi.Square, -textF$overlap.Ctrl.Case,
-textF$overlap.Ctrl.Case, textF$Repeats, abs(textF$delta.signal),
abs(textF$delta.signal), -textF$sd.Ctrl, -textF$sd.Case), 2, function(x) rank(-x,
ties.method = "max"))))
# Chi, Delta & SD
ordS <- rank(-rowMedians(apply(cbind(textF$Standard.Chi.Square, textF$Standard.Chi.Square,
abs(textF$delta.signal), textF$Repeats, abs(textF$delta.signal),
textF$h.Range.Ctrl, textF$h.Range.Case), 2, function(x) rank(-x,
ties.method = "max"))))
# Saving initial modified 'featureTable' as R data.frame called 'textF0'.
deco@featureTable <- deco@featureTable[ordG, ]
textF0 <- textF
textF <- textF[ordG, ]
# Object defining samples membership to classes previously compared.
classes <- unlist(lapply(deco@NSCAcluster, function(x) unlist(strsplit(x$samplesSubclass[1],
split = " Subclass"))[1]))
} else {
# p.value from NSCA analysis.
p.val <- deco@NSCAcluster$All$NSCA$P.Value
# General information about subclasses found.
infoSubclass <- deco@NSCAcluster$All$infoSubclass
rel <- rep(p.val[1] <= 0.05, dim(deco@NSCAcluster$All$infoSubclass)[1])
infoSubclass <- data.frame(infoSubclass, isRelevant = as.character(rel))
# Sample membership to subclasses.
samplesSubclass <- cbind(deco@NSCAcluster$All$samplesSubclass[order(deco@NSCAcluster$All$samplesSubclass[,
1]), ])
samplesSubclass <- cbind(Samples = rownames(samplesSubclass), Subclass = samplesSubclass[,
1])
rownames(samplesSubclass) <- seq_len(length(rownames(samplesSubclass)))
# NSCA and hierarchical clustering information.
nsc <- round(rbind(deco@NSCAcluster$All$var, p.val, deco@NSCAcluster$All$hclustSamp$huber),
3)
colnames(nsc) <- c("All samples")
rownames(nsc) <- c("Variability explained by NSCA", "p.value", "Huber's gamma")
        # Assignment of colors to all subclasses; kept consistent throughout
        # the report.
color.cluster <- colorRampPalette(c("greenyellow", "lightblue", "darkgreen",
"darkred"))(dim(infoSubclass)[1])
names(color.cluster) <- rownames(infoSubclass)
# Columns of 'featureTable' to be printed.
names.col <- c("ID", "SYMBOL", "Standard.Chi.Square", "Repeats", "Repeats.index",
"Avrg.logFC", "Dendrogram.group", "h", "sd", "h.Range")
ind <- colnames(deco@featureTable) %in% names.col[!names.col %in%
c("ID", "SYMBOL")]
deco@featureTable[, ind] <- apply(deco@featureTable[, ind], 2, as.numeric)
textF <- deco@featureTable[, colnames(deco@featureTable) %in% names.col]
textF <- textF[, na.omit(match(names.col, colnames(textF)))]
ordG <- order(rowMedians(apply(cbind(textF$Standard.Chi.Square, textF$Standard.Chi.Square,
textF$Repeats, textF$Repeats, textF$h.Range), 2, function(x) rank(-x,
ties.method = "max"))))
ord <- ordS <- ordG
        # No sense in making two different rankings for an 'Unsupervised' analysis.
deco@featureTable <- deco@featureTable[ord, ]
textF0 <- textF
textF <- textF[ord, ][seq_len(info.size), ]
classes <- sub$classes
}
return(list(deco = deco, p.val = p.val, color.cluster = color.cluster,
infoSubclass = infoSubclass, ordG = ordG, ordS = ordS, rel = rel,
classes = classes, nsc = nsc, textF = textF, textF0 = textF0, names.col = names.col,
samplesSubclass = samplesSubclass))
}
################################# NSCACluster: NSCA and biclustering of features and samples
NSCACluster <- function(mx, data = NULL, id.names = NULL, k = NULL, label = NA,
v = 80, method.dend = NULL, raw = NULL, UpDw = NULL, dir = NA) {
if (is.null(id.names))
id.names <- rownames(mx)
## Considering only UP for "mixed" profiles
if(all(!is.null(UpDw))) {
mixed <- names(which(table(unname(vapply(rownames(mx), function(x)
unlist(strsplit(x, split = "deco"))[1], "character"))) > 1))
if(length(mixed) > 0)
mx <- mx[!rownames(mx) %in% paste(mixed, "decoDOWN", sep = ""),]
}
## Calculating NSCA...
ca_res <- NSCA(as.matrix(mx[rowSums(mx) > 0, colSums(mx) > 0]), v = v)
nd <- min(which(ca_res$Inertia[, 3] >= v))
if (nd == 1) {
nd <- 2
message(.timestamp(), "-- 1D is enough to reach explained variability from input.
2D will be considered")
}
ca_ev <- rbind(ca_res$fbip[, seq_len(nd)], ca_res$g[, seq_len(nd)])
# Renaming feature IDs
if (all(!is.null(id.names))) {
rownames(ca_ev)[rownames(ca_ev) %in% names(id.names)] <- as.character(id.names[rownames(ca_ev)[rownames(ca_ev) %in%
names(id.names)]])
}
# Calculating h-statistic
if (all(!is.null(raw)) & all(!is.null(data))) {
dispersion <- data[id.names[rownames(ca_res$inner.prod)], colnames(ca_res$inner.prod)] -
raw[id.names[rownames(ca_res$inner.prod)]]
# Dispersion inversion for DOWN regulated within a category of samples
if (all(!is.null(UpDw))) {
dispersion[rownames(dispersion) %in% id.names[id.names %in% names(UpDw[UpDw ==
dir])], ] <- -dispersion[rownames(dispersion) %in% id.names[id.names %in%
names(UpDw[UpDw == dir])], ]
}
inner <- as.matrix(ca_res$inner.prod)
inner[] <- scale(as.vector(inner))
dispersion[] <- scale(as.vector(dispersion))
# Make positive ranges
inner <- inner + abs(min(inner)) + 1
rownames(dispersion) <- rownames(inner)
ca_res$inner.prod <- as.matrix(inner * dispersion - mean(as.matrix(inner *
dispersion)))
# 'h' inversion for DOWN regulated within a category of samples
if (all(!is.null(UpDw))) {
ca_res$inner.prod[rownames(ca_res$inner.prod) %in% names(id.names)[id.names %in%
names(UpDw[UpDw == dir])], ] <- -ca_res$inner.prod[rownames(ca_res$inner.prod) %in%
names(id.names)[id.names %in% names(UpDw[UpDw == dir])], ]
}
} else if (all(!is.null(data))) {
inner <- as.matrix(ca_res$inner.prod)
inner[] <- scale(as.vector(inner))
dispersion <- data[id.names[rownames(ca_res$inner.prod)], colnames(ca_res$inner.prod)] -
rowMeans(data[id.names[rownames(ca_res$inner.prod)], colnames(ca_res$inner.prod)])
dispersion[] <- scale(as.vector(dispersion))
# Make positive ranges
inner <- inner + abs(min(inner)) + 1
# dispersion <- dispersion + abs(min(dispersion)) + 1
ca_res$inner.prod <- as.matrix(inner * dispersion - mean(as.matrix(inner *
dispersion)))
    } else {
        message("Data has not been provided, so the 'h' statistic will not be computed. The inner product from the NSCA will be used for biclustering of features and samples.")
    }
    # Ensure row names carry over after the inner product was rescaled.
    if (all(!is.null(data))) {
        rownames(ca_res$inner.prod) <- rownames(inner)
    }
# Biclustering
info.dend.samp <- cophDECO(data = as.matrix(ca_res$inner.prod), method.heatmap = method.dend,
k = k, scale = FALSE, coph = TRUE)
info.dend.feat <- cophDECO(data = as.matrix(t(ca_res$inner.prod)), method.heatmap = "ward.D",
k = 10, scale = FALSE, verbose = FALSE, coph = FALSE)
k <- max(info.dend.samp$cluster)
# Calculating some statistics...
kk <- gg1 <- gg2 <- ss <- vector(length = k)
message(.timestamp(), "-- Optimized for ", k, " subclasses.")
info <- vapply(seq_len(k), function(y) rowMeans(cbind(as.matrix(ca_res$inner.prod)[,
info.dend.samp$cluster == y])), FUN.VALUE = numeric(dim(ca_res$inner.prod)[1]))
info <- cbind(info, ca_res$di[rownames(info)])
if (k > 1)
info <- as.data.frame(cbind(info, matrix(nrow = dim(info)[1], ncol = 3)))
else
info <- as.data.frame(t(rbind(info, matrix(nrow = dim(info)[1], ncol = 3))))
colnames(info) <- c(paste("Scl", seq_len(k), sep = ""), "Tau.feature",
"Closer.subclass", "h.Best", "ID")
info <- info[order(rownames(info), decreasing = TRUE), ]
if (k > 1) {
info[, c("Closer.subclass")] <- apply(info[, seq_len(k)], 1, function(x) which(abs(x) ==
max(abs(x)))[1])
} else {
info[, c("Closer.subclass")] <- rep(1, dim(info)[1])
}
for (i in seq_len(k)) {
kk[i] <- length(which(colnames(mx) %in% names(info.dend.samp$cluster[info.dend.samp$cluster ==
i])))
gg1[i] <- length(which(info[which(vapply(rownames(info), function(x) unlist(strsplit(x,
split = "deco", fixed = TRUE))[2], character(1)) == "UP"), "Closer.subclass"] ==
i))
gg2[i] <- length(which(info[which(vapply(rownames(info), function(x) unlist(strsplit(x,
split = "deco", fixed = TRUE))[2], character(1)) == "DOWN"), "Closer.subclass"] ==
i))
p <- abs(info[info[, "Closer.subclass"] == i, i])
ss[i] <- mean(p[order(p, decreasing = TRUE)][seq_len((length(p) *
0.05))])
}
if (all(!is.na(label))) {
infoSubclass <- data.frame(Samples = kk, FeaturesUP = gg1, FeaturesDOWN = gg2,
average.hStat.perc95 = ss, row.names = paste(label, "Subclass", seq(from = 1,
to = k), sep = " "))
} else {
infoSubclass <- data.frame(Samples = kk, FeaturesUP = gg1, FeaturesDOWN = gg2,
average.hStat.perc95 = ss, row.names = paste("Subclass", seq(from = 1, to = k),
sep = " "))
}
if (k > 1) {
info[, "h.Best"] <- apply(info[, c(paste("Scl", seq_len(k), sep = ""),
"Closer.subclass")], 1, function(x) x[x[k + 1]])
} else {
info[, "h.Best"] <- info[, 1]
}
# Renaming some feature IDs and removing duplicated features.
info[, c("ID")] <- as.character(id.names[rownames(info)])
rownames(ca_ev)[rownames(ca_ev) %in% rownames(mx)][!(duplicated(id.names[rownames(ca_ev[rownames(ca_ev) %in%
rownames(mx), ])]) | duplicated(id.names[rownames(ca_ev[rownames(ca_ev) %in%
rownames(mx), ])], fromLast = TRUE))] <- id.names[rownames(ca_ev[rownames(ca_ev) %in%
rownames(mx), ])][!(duplicated(id.names[rownames(ca_ev[rownames(ca_ev) %in%
rownames(mx), ])]) | duplicated(id.names[rownames(ca_ev[rownames(ca_ev) %in%
rownames(mx), ])], fromLast = TRUE))]
info <- info[!rownames(info) %in% rownames(info[info[, c("ID")] %in% info[duplicated(info[,
c("ID")]), c("ID")], ])[which(unlist(strsplit(rownames(info[info[,
c("ID")] %in% info[duplicated(info[, c("ID")]), c("ID")], ]), split = "deco",
fixed = TRUE)) == "DOWN")/2], ]
rownames(ca_res$inner.prod)[!(duplicated(id.names[rownames(ca_res$inner.prod)]) |
duplicated(id.names[rownames(ca_res$inner.prod)], fromLast = TRUE))] <- id.names[rownames(ca_res$inner.prod)][!(duplicated(id.names[rownames(ca_res$inner.prod)]) |
duplicated(id.names[rownames(ca_res$inner.prod)], fromLast = TRUE))]
# Calculating h statistic per feature per subclass of samples.
ord <- seq_len(length(info.dend.feat$cluster))
names(ord) <- rownames(ca_res$inner.prod)[rev(info.dend.feat$dend$order)]
patt <- info.dend.feat$cluster
for (i in seq_len(max(info.dend.feat$cluster))) patt[info.dend.feat$cluster ==
i] <- which(order(vapply(seq_len(max(info.dend.feat$cluster)), function(x) mean(which(info.dend.feat$cluster[rev(info.dend.feat$dend$order)] ==
x)), numeric(1))) == i)
info.dend.feat$cluster <- patt
info <- data.frame(info, h.Range = apply(info[, seq_len(k)], 1, function(x) diff(range(x))),
Dendrogram.group = info.dend.feat$cluster[rownames(info)], Dendrogram.order = ord[rownames(info)])
rownames(info) <- as.character(info[, c("ID")])
rankingH <- cbind(t(gdata::interleave(t(apply(info[, seq_len(k)], 2, function(x) rank(-abs(x),
ties.method = "max"))), t(info[, seq_len(k)]))), info[, "h.Range"],
info[, "Dendrogram.group"], ord[rownames(info)])
colnames(rankingH) <- c(paste(rep(c("Ranking", "h"), k), rep(paste("Scl",
seq_len(k), sep = ""), each = 2), sep = "."), "h.Range", "Dendrogram.group",
"Dendrogram.order")
# Find out sample membership to any subclass.
samplesSubclass <- cbind(info.dend.samp$cluster)
colnames(samplesSubclass) <- "Subclass"
for (i in seq_len(dim(samplesSubclass)[1])) samplesSubclass[i, 1] <- rownames(infoSubclass)[as.numeric(samplesSubclass[i,
1])]
# Returning results.
if (all(!is.null(data))) {
names(ca_res)[names(ca_res) == "inner.prod"] <- "h"
results <- list(ca_res, info, rankingH, infoSubclass, mx, ca_ev, samplesSubclass,
ca_res$Inertia[nd, 3], method.dend, info.dend.samp, info.dend.feat,
inner, dispersion)
names(results) <- c("NSCA", "infoFeature", "rankingFeature.h", "infoSubclass",
"incidenceMatrix", "Biplot.coordinates", "samplesSubclass", "var",
"clus.method", "hclustSamp", "hclustFeat", "inner", "dispersion")
} else {
results <- list(ca_res, info, rankingH, infoSubclass, mx, ca_ev, samplesSubclass,
ca_res$Inertia[nd, 3], method.dend, info.dend.samp, info.dend.feat)
names(results) <- c("NSCA", "infoFeature", "rankingFeature.h", "infoSubclass",
"incidenceMatrix", "Biplot.coordinates", "samplesSubclass", "var",
"clus.method", "hclustSamp", "hclustFeat")
}
names(results$var) <- "Variability explained by NSCA"
message(.timestamp(), "-- Summary of clustering analysis:")
print(infoSubclass)
print(results$var)
return(results)
}
##############################
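# Cluster validation statistics; this internal routine closely follows
# fpc::cluster.stats() and returns, among others, the 'pearsongamma'
# (normalized gamma) index used by cophDECO to choose the number of subclasses.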
.cluster.stat <- function(d = NULL, clustering, alt.clustering = NULL, noisecluster = FALSE,
silhouette = TRUE, G2 = FALSE, G3 = FALSE, wgap = TRUE, sepindex = TRUE,
sepprob = 0.1, sepwithnoise = TRUE, compareonly = FALSE, aggregateonly = FALSE) {
if (!is.null(d)) {
d <- as.dist(d)
}
cn <- max(clustering)
clusteringf <- as.factor(clustering)
clusteringl <- levels(clusteringf)
cnn <- length(clusteringl)
if (cn != cnn) {
warning("clustering renumbered because maximum != number of clusters")
for (i in seq_len(cnn)) clustering[clusteringf == clusteringl[i]] <- i
cn <- cnn
}
n <- length(clustering)
noisen <- 0
cwn <- cn
if (noisecluster) {
noisen <- sum(clustering == cn)
cwn <- cn - 1
}
diameter <- average.distance <- median.distance <- separation <- average.toother <- cluster.size <- within.dist <- between.dist <- numeric(0)
for (i in seq_len(cn)) cluster.size[i] <- sum(clustering == i)
pk1 <- cluster.size/n
pk10 <- pk1[pk1 > 0]
h1 <- -sum(pk10 * log(pk10))
corrected.rand <- vi <- NULL
if (!is.null(alt.clustering)) {
choose2 <- function(v) {
out <- numeric(0)
for (i in seq_len(length(v))) out[i] <- ifelse(v[i] >= 2, choose(v[i],
2), 0)
out
}
cn2 <- max(alt.clustering)
clusteringf <- as.factor(alt.clustering)
clusteringl <- levels(clusteringf)
cnn2 <- length(clusteringl)
if (cn2 != cnn2) {
warning("alt.clustering renumbered because maximum != number of clusters")
for (i in seq_len(cnn2)) alt.clustering[clusteringf == clusteringl[i]] <- i
cn2 <- cnn2
}
nij <- table(clustering, alt.clustering)
dsum <- sum(choose2(nij))
cs2 <- numeric(0)
for (i in seq_len(cn2)) cs2[i] <- sum(alt.clustering == i)
sum1 <- sum(choose2(cluster.size))
sum2 <- sum(choose2(cs2))
pk2 <- cs2/n
pk12 <- nij/n
corrected.rand <- (dsum - sum1 * sum2/choose2(n))/((sum1 + sum2)/2 -
sum1 * sum2/choose2(n))
pk20 <- pk2[pk2 > 0]
h2 <- -sum(pk20 * log(pk20))
icc <- 0
for (i in seq_len(cn)) for (j in seq_len(cn2)) if (pk12[i, j] > 0) {
icc <- icc + pk12[i, j] * log(pk12[i, j]/(pk1[i] * pk2[j]))
}
vi <- h1 + h2 - 2 * icc
}
if (compareonly) {
out <- list(corrected.rand = corrected.rand, vi = vi)
} else {
dmat <- as.matrix(d)
within.cluster.ss <- 0
overall.ss <- nonnoise.ss <- sum(d^2)/n
if (noisecluster) {
nonnoise.ss <- sum(as.dist(dmat[clustering <= cwn, clustering <=
cwn])^2)/sum(clustering <= cwn)
}
ave.between.matrix <- separation.matrix <- matrix(0, ncol = cn, nrow = cn)
di <- list()
for (i in seq_len(cn)) {
cluster.size[i] <- sum(clustering == i)
di <- as.dist(dmat[clustering == i, clustering == i])
if (i <= cwn) {
within.cluster.ss <- within.cluster.ss + sum(di^2)/cluster.size[i]
within.dist <- c(within.dist, di)
}
if (length(di) > 0) {
diameter[i] <- max(di)
} else {
diameter[i] <- NA
}
average.distance[i] <- mean(di)
median.distance[i] <- median(di)
bv <- numeric(0)
for (j in seq_len(cn)) {
if (j != i) {
sij <- dmat[clustering == i, clustering == j]
bv <- c(bv, sij)
if (i < j) {
separation.matrix[i, j] <- separation.matrix[j, i] <- min(sij)
ave.between.matrix[i, j] <- ave.between.matrix[j, i] <- mean(sij)
if (i <= cwn & j <= cwn) {
between.dist <- c(between.dist, sij)
}
}
}
}
separation[i] <- min(bv)
average.toother[i] <- mean(bv)
}
average.between <- mean(between.dist)
average.within <- mean(within.dist)
nwithin <- length(within.dist)
nbetween <- length(between.dist)
between.cluster.ss <- nonnoise.ss - within.cluster.ss
ch <- between.cluster.ss * (n - noisen - cwn)/(within.cluster.ss *
(cwn - 1))
clus.avg.widths <- avg.width <- NULL
if (silhouette) {
sii <- cluster::silhouette(clustering, dmatrix = dmat)
sc <- summary(sii)
clus.avg.widths <- sc$clus.avg.widths
if (noisecluster) {
avg.width <- mean(sii[clustering <= cwn, 3])
} else {
avg.width <- sc$avg.width
}
}
g2 <- g3 <- cn2 <- cwidegap <- widestgap <- sindex <- NULL
if (G2) {
splus <- sminus <- 0
for (i in seq_len(nwithin)) {
splus <- splus + sum(within.dist[i] < between.dist)
sminus <- sminus + sum(within.dist[i] > between.dist)
}
g2 <- (splus - sminus)/(splus + sminus)
}
if (G3) {
sdist <- sort(c(within.dist, between.dist))
sr <- nwithin + nbetween
dmin <- sum(sdist[seq_len(nwithin)])
dmax <- sum(sdist[(sr - nwithin + 1):sr])
g3 <- (sum(within.dist) - dmin)/(dmax - dmin)
}
pearsongamma <- cor(c(within.dist, between.dist), c(rep(0, nwithin),
rep(1, nbetween)))
dunn <- min(separation[seq_len(cwn)])/max(diameter[seq_len(cwn)],
na.rm = TRUE)
acwn <- ave.between.matrix[seq_len(cwn), seq_len(cwn)]
dunn2 <- min(acwn[upper.tri(acwn)])/max(average.distance[seq_len(cwn)],
na.rm = TRUE)
if (wgap) {
cwidegap <- rep(0, cwn)
for (i in seq_len(cwn)) if (sum(clustering == i) > 1) {
cwidegap[i] <- max(hclust(as.dist(dmat[clustering == i, clustering ==
i]), method = "single")$height)
}
widestgap <- max(cwidegap)
}
if (sepindex) {
psep <- rep(NA, n)
if (sepwithnoise | !noisecluster) {
for (i in seq_len(n)) psep[i] <- min(dmat[i, clustering !=
clustering[i]])
minsep <- floor(n * sepprob)
} else {
dmatnn <- dmat[clustering <= cwn, clustering <= cwn]
clusteringnn <- clustering[clustering <= cwn]
for (i in seq_len((n - noisen))) psep[i] <- min(dmatnn[i,
clusteringnn != clusteringnn[i]])
minsep <- floor((n - noisen) * sepprob)
}
sindex <- mean(sort(psep)[seq_len(minsep)])
}
if (!aggregateonly) {
out <- list(n = n, cluster.number = cn, cluster.size = cluster.size,
min.cluster.size = min(cluster.size[seq_len(cwn)]), noisen = noisen,
diameter = diameter, average.distance = average.distance,
median.distance = median.distance, separation = separation,
average.toother = average.toother, separation.matrix = separation.matrix,
ave.between.matrix = ave.between.matrix, average.between = average.between,
average.within = average.within, n.between = nbetween, n.within = nwithin,
max.diameter = max(diameter[seq_len(cwn)], na.rm = TRUE),
min.separation = sepwithnoise * min(separation) + (!sepwithnoise) *
min(separation[seq_len(cwn)]), within.cluster.ss = within.cluster.ss,
clus.avg.silwidths = clus.avg.widths, avg.silwidth = avg.width,
g2 = g2, g3 = g3, pearsongamma = pearsongamma, dunn = dunn,
dunn2 = dunn2, entropy = h1, wb.ratio = average.within/average.between,
ch = ch, cwidegap = cwidegap, widestgap = widestgap, sindex = sindex,
corrected.rand = corrected.rand, vi = vi)
} else {
out <- list(n = n, cluster.number = cn, min.cluster.size = min(cluster.size[seq_len(cwn)]),
noisen = noisen, average.between = average.between, average.within = average.within,
max.diameter = max(diameter[seq_len(cwn)], na.rm = TRUE),
min.separation = sepwithnoise * min(separation) + (!sepwithnoise) *
min(separation[seq_len(cwn)]), ave.within.cluster.ss = within.cluster.ss/(n -
noisen), avg.silwidth = avg.width, g2 = g2, g3 = g3, pearsongamma = pearsongamma,
dunn = dunn, dunn2 = dunn2, entropy = h1, wb.ratio = average.within/average.between,
ch = ch, widestgap = widestgap, sindex = sindex, corrected.rand = corrected.rand,
vi = vi)
}
}
out
}
############################## Non-Symmetrical Correspondence Analysis (NSCA)
NSCA <- function(N, v = 80) {
I <- nrow(N) # Number of rows of table
J <- ncol(N) # Number of columns of table
Inames <- dimnames(N)[1] # Row category names
Jnames <- dimnames(N)[2] # Column category names
n <- sum(N) # Total number of classifications in the table
p <- N * (1/n) # Matrix of joint relative proportions
Imass <- as.matrix(rowSums(p))
Jmass <- as.matrix(colSums(p))
ItJ <- Imass %*% t(Jmass)
y <- p - ItJ
dI <- diag(c(Imass), nrow = I, ncol = I)
dJ <- diag(c(Jmass), nrow = J, ncol = J)
Ih <- Imass^-0.5
Jh <- Jmass^-0.5
dIh <- diag(c(Ih), nrow = I, ncol = I)
dJh <- diag(c(Jh), nrow = J, ncol = J)
    uni <- rep(1, times = J) # Vector of ones of length J
x <- (p %*% solve(dJ) - Imass %*% t(uni)) %*% sqrt(dJ)
# Column Weighted Matrix of Pi
sva <- svd(x) # SVD of Pi
dmu <- diag(sva$d)
fbip <- sva$u
dimnames(fbip) <- list(paste(Inames[[1]]), paste(seq_len(min(I, J))))
f <- sva$u %*% dmu # Row Principal Coordinates
g <- dJh %*% sva$v %*% dmu # Column Principal Coordinates
dimnames(f) <- list(paste(Inames[[1]]), paste(seq_len(min(I, J))))
dimnames(g) <- list(paste(Jnames[[1]]), paste(seq_len(min(I, J))))
Principal.Inertia <- diag(t(f[, seq_len(min(I - 1, J - 1))]) %*% f[, seq_len(min(I -
1, J - 1))])
Total.Inertia <- sum(Principal.Inertia)
tau.num <- Total.Inertia # Numerator of Goodman-Kruskal tau
tau.denom <- 1 - sum(Imass^2) # Denominator of Goodman-Kruskal tau
tau <- tau.num/tau.denom # Goodman-Kruskal tau index
Ctau <- (n - 1) * (I - 1) * tau # Light & Margolin's C-statistic
Percentage.Inertia <- 100 * (Principal.Inertia/tau.num)
Cumm.Inertia <- cumsum(Percentage.Inertia)
Inertia <- cbind(Principal.Inertia, Percentage.Inertia, Cumm.Inertia)
dimnames(Inertia)[1] <- list(paste("Axis", seq_len(min(I - 1, J - 1)),
sep = " "))
q.value <- 1 - pchisq(Ctau, df = (I - 1) * (J - 1))
inner.prod <- fbip[, seq_len(which(Cumm.Inertia >= v)[1])] %*% t(g[, seq_len(which(Cumm.Inertia >=
v)[1])])
rownames(inner.prod) <- rownames(fbip)
tau.num.j <- rowSums(apply(g^2, 2, function(x) Jmass * (x)/tau.num))
names(tau.num.j) <- rownames(Jmass)
list(N = N, f = f, fbip = fbip, g = g, tau = tau, tau.num.j = tau.num.j,
di = apply(f, 1, function(x) sum(x^2)), dj = apply(g, 1, function(x) sum(x^2)),
Cstat = Ctau, Total.Inertia = Total.Inertia, P.Value = q.value, Inertia = Inertia,
inner.prod = inner.prod)
}
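# Example (illustrative): NSCA of a small features-by-samples table; the
# names are arbitrary.
#   N <- matrix(c(10, 2, 3, 1, 15, 9), nrow = 3,
#               dimnames = list(c("f1", "f2", "f3"), c("s1", "s2")))
#   res <- NSCA(N, v = 80)
#   res$tau      # Goodman-Kruskal tau: predictability of rows from columns
#   res$P.Value  # p-value of Light & Margolin's C-statistic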
###################################################### Short functions
range01 <- function(x) {
(x - min(x))/(max(x) - min(x))
}
is.even <- function(x) {
x%%2 == 0
}
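# Examples: range01(c(2, 4, 6)) returns 0.0 0.5 1.0;
# is.even(1:4) returns FALSE TRUE FALSE TRUE.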