#' Reselect markers by thresholding
#'
#' This function generates a new list of markers based on initially detected
#' markers by CAM and/or prior markers.
#' @param data A data set that will be internally coerced into a matrix.
#' Each row is a gene and each column is a sample.
#' data should be in non-log linear space with non-negative numerical values
#' (i.e. >= 0).
#' Missing values are not supported.
#' All-zero rows will be removed internally.
#' @param MGlist A list of vectors, each of which contains CAM-detected markers
#' and/or prior markers for one subpopulation.
#' @param fc.thres The lower threshold of fold change to select markers,
#' an absolute value (e.g. 10) or a quantile value after 'q' (e.g. 'q0.5',
#' the median of fold changes of one subpopulation's input markers).
#' Each subpopulation can have its own threshold if a vector is provided.
#' The default is 'q0.5'. If NULL, still use the input markers.
#' @param err.thres The upper threshold of reconstruction error to select
#' markers, an absolute value or a quantile value after 'q' (e.g. 'q0.5',
#' the median of reconstruction errors of one subpopulation's input markers;
#' 'q1.2', the maximum error times 1.2 ).
#' Each subpopulation can have its own threshold if a vector is provided.
#' The default is NULL, which means no such a threshold is applied.
#' @details Considering some meaningful markers may be mistakenly filtered by
#' preprocessing and thus missed by \code{\link{CAM}}, this function use the
#' input marker gene list to estimate proportions by \code{\link{AfromMarkers}}
#' and then estimate expression levels. Next, a new list of markers are
#' generated by fold change threshold and reconstruction error threshold.
#'
#' The input marker genes could also be from other supervised detection and/or
#' from literatures. This function reselects a list of marker genes
#' based on the input.
#' @return A list of vectors, each of which contains new selected markers
#' for one subpopulation.
#' @export
#' @examples
#' #obtain data and run CAM
#' data(ratMix3)
#' data <- ratMix3$X
#' rCAM <- CAM(data, K = 3, dim.rdc= 3, thres.low = 0.30, thres.high = 0.95)
#' #obtain marker genes detected by CAM with a fixed K
#' MGlist <- MGsforA(rCAM, K = 3)
#' #Reselect markers from all genes
#' MGlist.re <- reselectMG(data, MGlist, fc.thres='q0.5')
#' MGlist.re <- reselectMG(data, MGlist, fc.thres='q0.5', err.thres='q0.95')
reselectMG <- function(data, MGlist, fc.thres='q0.5', err.thres=NULL) {
if (is(data, "data.frame")) {
data <- as.matrix(data)
} else if (is(data, "SummarizedExperiment")) {
data <- SummarizedExperiment::assay(data)
} else if (is(data, "ExpressionSet")) {
data <- Biobase::exprs(data)
} else if (is(data, "matrix") == FALSE) {
stop("Only matrix, data frame, SummarizedExperiment and ExpressionSet
object are supported for expression data!")
}
if (sum(is.na(data)) > 0) {
stop("Data with missing values are not supported!")
}
if (sum(data<0) > 0) {
stop("Only non-negative data are supported!")
}
data <- data[rowSums(data) > 0,]
MGlist <- lapply(MGlist, function(x) intersect(x,rownames(data)))
Xproj <- t(data/rowSums(data))
A0 <- AfromMarkers(data, MGlist, scaleRecover = FALSE)
A0.scaled <- AfromMarkers(data, MGlist)
S0 <- t(.fcnnls(A0, Xproj)$coef)
err0 <- rowMeans((t(Xproj) - S0%*%t(A0))^2)
pMGstat0 <- MGstatistic(data, A0.scaled)
#parse fold change threshold
if (length(fc.thres) == 1) {
fc.thres <- rep(fc.thres, length(MGlist))
} else if (length(fc.thres)!=length(MGlist)) {
stop("The length of fc.thres should be 1 or
equal to the length of marker lists")
}
mg.fc.thres <- rep(1,length(MGlist))
for (i in seq_along(MGlist)) {
if (is.character(fc.thres[i]) && grepl('^q', fc.thres[i])) {
mg.fc.thres[i] <-
as.numeric(substr(fc.thres[i], 2, nchar(fc.thres[i])))
if (mg.fc.thres[i] > 1 || mg.fc.thres[i] < 0) {
stop("Invalid threshld value!")
}
mg.fc.thres[i] <- quantile(
pMGstat0$OVE.FC[rownames(data) %in% MGlist[[i]]],
mg.fc.thres[i])
} else if (is.numeric(fc.thres[i]) ||
!is.na(as.numeric(fc.thres[i]))) {
mg.fc.thres[i] <- as.numeric(fc.thres[i])
} else {
stop("Invalid threshold value!")
}
}
#parse reconstruction error threshold
if (!is.null(err.thres)) {
if (length(err.thres) == 1) {
err.thres <- rep(err.thres, length(MGlist))
} else if (length(err.thres)!=length(MGlist)) {
stop("The length of err.thres should be 1 or
equal to the length of marker lists")
}
mg.err.thres <- rep(0,length(MGlist))
for (i in seq_along(MGlist)) {
if (is.character(err.thres[i]) && grepl('^q', err.thres[i])) {
mg.err.thres[i] <-
as.numeric(substr(err.thres[i], 2, nchar(err.thres[i])))
if (mg.err.thres[i] < 0) {
stop("Invalid threshld value!")
} else if (mg.err.thres[i] > 1){
mg.err.thres[i] <-
max(err0[rownames(data) %in% MGlist[[i]]]) *
mg.err.thres[i]
} else {
mg.err.thres[i] <- quantile(
err0[rownames(data) %in% MGlist[[i]]],
mg.err.thres[i])
}
} else if (is.numeric(err.thres[i]) ||
!is.na(as.numeric(err.thres[i]))) {
mg.err.thres[i] <- as.numeric(err.thres[i])
} else {
stop("Invalid threshold value!")
}
}
}
if (is.null(err.thres)) {
mgext <- lapply(seq_along(MGlist), function(x)
rownames(data)[pMGstat0$idx == x &
pMGstat0$OVE.FC >= mg.fc.thres[x]])
} else {
mgext <- lapply(seq_along(MGlist), function(x)
rownames(data)[pMGstat0$idx == x &
pMGstat0$OVE.FC >= mg.fc.thres[x] &
err0 <= mg.err.thres[x]])
}
return(mgext)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.