#' Runs unsupervised version of MetaNeighbor
#'
#' When it is difficult to know how cell type labels compare across datasets this
#' function helps users to make an educated guess about the overlaps without
#' requiring in-depth knowledge of marker genes
#'
#' @param var_genes vector of high variance genes.
#' @param dat SummarizedExperiment object containing gene-by-sample
#' expression matrix.
#' @param i default value 1; non-zero index value of assay containing the matrix
#' data
#' @param study_id a vector that lists the Study (dataset) ID for each sample
#' @param cell_type a vector that lists the cell type of each sample
#' @param fast_version default value FALSE; a boolean flag indicating whether
#' to use the fast and low memory version of MetaNeighbor
#'
#' @return The output is a cell type-by-cell type mean AUROC matrix, which is
#' built by treating each pair of cell types as testing and training data for
#' MetaNeighbor, then taking the average AUROC for each pair (NB scores will not
#' be identical because each test cell type is scored out of its own dataset,
#' and the differential heterogeneity of datasets will influence scores).
#'
#' @examples
#' data(mn_data)
#' var_genes = variableGenes(dat = mn_data, exp_labels = mn_data$study_id)
#' celltype_NV = MetaNeighborUS(var_genes = var_genes,
#' dat = mn_data,
#' study_id = mn_data$study_id,
#' cell_type = mn_data$cell_type,
#' fast_version=FALSE)
#' celltype_NV
#'
#' @export
#'
MetaNeighborUS <- function(var_genes, dat, i = 1, study_id, cell_type, fast_version = FALSE){
dat <- SummarizedExperiment::assay(dat, i = i)
samples <- colnames(dat)
#check obj contains study_id
if(length(study_id)!=length(samples)){
stop('study_id length does not match number of samples')
}
#check obj contains cell_type
if(length(cell_type)!=length(samples)){
stop('cell_type length does not match number of samples')
}
matching_vargenes <- match(rownames(dat), var_genes)
matching_vargenes_count <- sum(!is.na(matching_vargenes))
if(matching_vargenes_count < 2){
stop("matching_vargenes should have more than 1 matching genes!",
call. = TRUE)
} else if(matching_vargenes_count < 5) {
warning("matching_vargenes should have more matching genes!",
immediate. = TRUE)
}
dat <- dat[!is.na(matching_vargenes),]
study_id <- as.character(study_id)
cell_type <- as.character(cell_type)
if (fast_version) {
cell_NV <- MetaNeighborUSLowMem(dat, study_id, cell_type)
} else {
cell_NV <- MetaNeighborUSDefault(dat, study_id, cell_type)
}
cell_NV <- (cell_NV+t(cell_NV))/2
return(cell_NV)
}
MetaNeighborUSDefault <- function(dat, study_id, cell_type) {
dat <- as.matrix(dat)
pheno <- as.data.frame(cbind(study_id,cell_type), stringsAsFactors = FALSE)
pheno$StudyID_CT <- paste(pheno$study_id, pheno$cell_type, sep = "|")
celltypes <- unique(pheno$StudyID_CT)
cell_labels <- matrix(0, ncol=length(celltypes), nrow=dim(pheno)[1])
rownames(cell_labels) <-colnames(dat)
colnames(cell_labels) <- celltypes
for(i in seq_along(celltypes)){
type <- celltypes[i]
matching_celltype <- match(pheno$StudyID_CT, type)
cell_labels[!is.na(matching_celltype),i] <- 1
}
cor_data <- stats::cor(dat, method="s")
rank_data <- cor_data*0
rank_data[] <- rank(cor_data, ties.method = "average", na.last = "keep")
rank_data[is.na(rank_data)] <- 0
rank_data <- rank_data/max(rank_data)
sum_in <- (rank_data) %*% cell_labels
sum_all <- matrix(apply(rank_data, MARGIN = 2, FUN = sum),
ncol = dim(sum_in)[2],
nrow = dim(sum_in)[1])
predicts <- sum_in/sum_all
cell_NV <- matrix(0, ncol=length(celltypes), nrow=length(celltypes))
colnames(cell_NV) <- colnames(cell_labels)
rownames(cell_NV) <- colnames(cell_labels)
for(i in seq_len(dim(cell_labels)[2])){
predicts_temp <- predicts
matching_celltype <- match(pheno$StudyID_CT, colnames(cell_labels)[i])
unique_studyID <- unique(pheno[!is.na(matching_celltype),"study_id"])
matching_studyID <- match(pheno$study_id, unique_studyID)
pheno2 <- pheno[!is.na(matching_studyID),]
predicts_temp <- predicts_temp[!is.na(matching_studyID),]
predicts_temp <- apply(abs(predicts_temp),
MARGIN = 2,
FUN = rank,
na.last= "keep",
ties.method="average")
filter <- matrix(0, ncol=length(celltypes), nrow=dim(pheno2)[1])
matches <- match(pheno2$StudyID_CT, colnames(cell_labels)[i])
filter[!is.na(matches),seq_along(celltypes)] <- 1
negatives = which(filter == 0, arr.ind = TRUE)
positives = which(filter == 1, arr.ind = TRUE)
predicts_temp[negatives] <- 0
np <- colSums(filter, na.rm = TRUE)
nn <- apply(filter, MARGIN = 2, FUN = function(x) sum(x==0,na.rm=TRUE))
p <- apply(predicts_temp, MARGIN = 2, FUN = sum, na.rm = TRUE)
cell_NV[i,]= (p/np - (np+1)/2)/nn
}
return(cell_NV)
}
MetaNeighborUSLowMem <- function(dat, study_id, cell_type, skip_network = TRUE) {
dat <- normalize_cols(dat)
colnames(dat) <- paste(study_id, cell_type, sep = "|")
studies <- unique(study_id)
data_subsets <- find_subsets(study_id, studies)
result <- create_result_matrix(colnames(dat))
for (study_A_index in seq_along(studies)) {
study_A <- dat[, data_subsets[, study_A_index]]
for (study_B_index in study_A_index:length(studies)) {
study_B <- dat[, data_subsets[, study_B_index]]
# study B votes for study A
if (skip_network) {
votes <- compute_votes_without_network(study_A, study_B)
} else {
network <- build_network(study_A, study_B)
votes <- compute_votes_from_network(network)
}
aurocs <- compute_aurocs(votes)
result[rownames(aurocs), colnames(aurocs)] <- aurocs
# study A votes for study B
if (skip_network) {
votes <- compute_votes_without_network(study_B, study_A)
} else {
votes <- compute_votes_from_network(t(network))
}
aurocs <- compute_aurocs(votes)
result[rownames(aurocs), colnames(aurocs)] <- aurocs
}
}
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.