R/reselectMG.R

Defines functions reselectMG

Documented in reselectMG

#' Reselect markers by thresholding
#'
#' This function generates a new list of markers based on initially detected
#' markers by CAM and/or prior markers.
#' @param data A data set that will be internally coerced into a matrix.
#'     Each row is a gene and each column is a sample.
#'     data should be in non-log linear space with non-negative numerical values
#'     (i.e. >= 0).
#'     Missing values are not supported.
#'     All-zero rows will be removed internally.
#' @param MGlist A list of vectors, each of which contains CAM-detected markers
#'     and/or prior markers for one subpopulation.
#' @param fc.thres The lower threshold of fold change to select markers,
#'     an absolute value (e.g. 10) or a quantile value after 'q' (e.g. 'q0.5',
#'     the median of fold changes of one subpopulation's input markers).
#'     Each subpopulation can have its own threshold if a vector is provided.
#'     The default is 'q0.5'. If NULL, still use the input markers.
#' @param err.thres The upper threshold of reconstruction error to select
#'     markers, an absolute value or a quantile value after 'q' (e.g. 'q0.5',
#'     the median of reconstruction errors of one subpopulation's input markers;
#'     'q1.2', the maximum error times 1.2 ).
#'     Each subpopulation can have its own threshold if a vector is provided.
#'     The default is NULL, which means no such a threshold is applied.
#' @details Considering some meaningful markers may be mistakenly filtered by
#' preprocessing and thus missed by \code{\link{CAM}}, this function use the
#' input marker gene list to estimate proportions by \code{\link{AfromMarkers}}
#' and then estimate expression levels. Next, a new list of markers are
#' generated by fold change threshold and reconstruction error threshold.
#'
#' The input marker genes could also be from other supervised detection and/or
#' from literatures. This function reselects a list of marker genes
#' based on the input.
#' @return A list of vectors, each of which contains new selected markers
#'     for one subpopulation.
#' @export
#' @examples
#' #obtain data and run CAM
#' data(ratMix3)
#' data <- ratMix3$X
#' rCAM <- CAM(data, K = 3, dim.rdc= 3, thres.low = 0.30, thres.high = 0.95)
#' #obtain marker genes detected by CAM with a fixed K
#' MGlist <- MGsforA(rCAM, K = 3)
#' #Reselect markers from all genes
#' MGlist.re <- reselectMG(data, MGlist, fc.thres='q0.5')
#' MGlist.re <- reselectMG(data, MGlist, fc.thres='q0.5', err.thres='q0.95')
reselectMG <- function(data, MGlist, fc.thres='q0.5', err.thres=NULL) {
    if (is(data, "data.frame")) {
        data <- as.matrix(data)
    } else if (is(data, "SummarizedExperiment")) {
        data <- SummarizedExperiment::assay(data)
    } else if (is(data, "ExpressionSet")) {
        data <- Biobase::exprs(data)
    } else if (is(data, "matrix") == FALSE) {
        stop("Only matrix, data frame, SummarizedExperiment and ExpressionSet
            object are supported for expression data!")
    }
    if (sum(is.na(data)) > 0) {
        stop("Data with missing values are not supported!")
    }
    if (sum(data<0) > 0) {
        stop("Only non-negative data are supported!")
    }
    data <- data[rowSums(data) > 0,]
    MGlist <- lapply(MGlist, function(x) intersect(x,rownames(data)))

    Xproj <- t(data/rowSums(data))

    A0 <- AfromMarkers(data, MGlist, scaleRecover = FALSE)
    A0.scaled <- AfromMarkers(data, MGlist)
    S0 <- t(.fcnnls(A0, Xproj)$coef)
    err0 <- rowMeans((t(Xproj) - S0%*%t(A0))^2)
    pMGstat0 <- MGstatistic(data, A0.scaled)

    #parse fold change threshold
    if (length(fc.thres) ==  1) {
        fc.thres <- rep(fc.thres, length(MGlist))
    } else if (length(fc.thres)!=length(MGlist)) {
        stop("The length of fc.thres should be 1 or
            equal to the length of marker lists")
    }

    mg.fc.thres <- rep(1,length(MGlist))
    for (i in seq_along(MGlist)) {
        if (is.character(fc.thres[i]) && grepl('^q', fc.thres[i])) {
            mg.fc.thres[i] <-
                as.numeric(substr(fc.thres[i], 2, nchar(fc.thres[i])))
            if (mg.fc.thres[i] > 1 || mg.fc.thres[i] < 0) {
                stop("Invalid threshld value!")
            }
            mg.fc.thres[i] <- quantile(
                pMGstat0$OVE.FC[rownames(data) %in% MGlist[[i]]],
                mg.fc.thres[i])
        } else if (is.numeric(fc.thres[i]) ||
                !is.na(as.numeric(fc.thres[i]))) {
            mg.fc.thres[i] <- as.numeric(fc.thres[i])
        } else {
            stop("Invalid threshold value!")
        }
    }

    #parse reconstruction error threshold
    if (!is.null(err.thres)) {
        if (length(err.thres) ==  1) {
            err.thres <- rep(err.thres, length(MGlist))
        } else if (length(err.thres)!=length(MGlist)) {
            stop("The length of err.thres should be 1 or
                equal to the length of marker lists")
        }

        mg.err.thres <- rep(0,length(MGlist))
        for (i in seq_along(MGlist)) {
            if (is.character(err.thres[i]) && grepl('^q', err.thres[i])) {
                mg.err.thres[i] <-
                    as.numeric(substr(err.thres[i], 2, nchar(err.thres[i])))
                if (mg.err.thres[i] < 0) {
                    stop("Invalid threshld value!")
                } else if (mg.err.thres[i] > 1){
                    mg.err.thres[i] <-
                        max(err0[rownames(data) %in% MGlist[[i]]]) *
                        mg.err.thres[i]
                } else {
                    mg.err.thres[i] <- quantile(
                        err0[rownames(data) %in% MGlist[[i]]],
                        mg.err.thres[i])
                }
            } else if (is.numeric(err.thres[i]) ||
                    !is.na(as.numeric(err.thres[i]))) {
                mg.err.thres[i] <- as.numeric(err.thres[i])
            } else {
                stop("Invalid threshold value!")
            }
        }
    }

    if (is.null(err.thres)) {
        mgext <- lapply(seq_along(MGlist), function(x)
            rownames(data)[pMGstat0$idx == x &
                        pMGstat0$OVE.FC >= mg.fc.thres[x]])
    } else {
        mgext <- lapply(seq_along(MGlist), function(x)
            rownames(data)[pMGstat0$idx == x &
                        pMGstat0$OVE.FC >= mg.fc.thres[x] &
                        err0 <= mg.err.thres[x]])
    }

    return(mgext)
}
Lululuella/debCAM documentation built on May 14, 2021, 2:45 p.m.