#' ID markers
#'
#' This will perform marker gene identification
#'
#' @param input the input ex_sc
#' @param print_progress will print progress if TRUE
#' @param id_by a pData variable to run id_markers on. Defaults to "Cluster"
#' @param overwrite if TRUE will overwrite previously calculated scores on the same id_by variable
#' @export
#' @details
#' This will find marker genes for each cluster. First, for each cluster it calculates the fraction of cells expressing a given gene,
#' and the mean expression of that gene (within the cluster).
#' It then compares these values to the values of all other groups looking for genes which maximize the distance in both
#' fraction expressing and the mean expression.
#' @examples
#' ex_sc_example <- id_markers(input = ex_sc_example, print_progress = TRUE)
id_markers <- function(input, id_by = "Cluster", print_progress = TRUE, overwrite = FALSE){
marker_input <- input
cols <- (unique(pData(input)[,id_by]))
cols <- as.character(cols)
cols <- sort(cols)
ind <- grep("marker_score_", colnames(fData(marker_input)))
if(length(ind) > 0){
if(overwrite == FALSE){
stop("Markers already calculated. Set overwrite to TRUE to recalculate")
}
if(overwrite == TRUE){
fData(marker_input) <- as.data.frame(fData(marker_input)[,-ind])
}
}
fData(marker_input)$tmp <- "tmp"
if(print_progress == TRUE){
print("Finding markers based on fraction expressing")
}
num_cluster <- length(unique(pData(marker_input)[,id_by]))
num_marker_genes <- nrow(fData(input))
non0 <- c()
percent <- c()
gene_percent <- c()
for(n in 1:length(cols)){
cluster <- cols[n]
index <- which(pData(marker_input)[,id_by] == cluster)
cells <- rownames(pData(marker_input))[index]
tmp_expr_matrix <- exprs(marker_input)[,cells]
non0 <- which(tmp_expr_matrix > 0)
tmp_expr_matrix[non0]<- 1
for(i in 1:nrow(tmp_expr_matrix)){
numExpr <- sum(tmp_expr_matrix[i,])
percent <- numExpr/ncol(tmp_expr_matrix)
gene_percent <- c(gene_percent, percent)
}
}
gene_markers_fractionExpression <- matrix(data = gene_percent, nrow = nrow(fData(marker_input)), ncol = length(unique(pData(marker_input)[,id_by])))
colnames(gene_markers_fractionExpression) <- c(seq(1:num_cluster))
rownames(gene_markers_fractionExpression) <- rownames(fData(marker_input))
gene_markers_meandiff <- matrix(ncol = ncol(gene_markers_fractionExpression), nrow=nrow(gene_markers_fractionExpression))
colnames(gene_markers_meandiff) <- c(seq(1:num_cluster))
rownames(gene_markers_meandiff) <- rownames(fData(marker_input))
for(i in 1:nrow(gene_markers_fractionExpression)){
genetest <- gene_markers_fractionExpression[i,]
for(j in 1:ncol(gene_markers_fractionExpression)){
refer <- genetest[j]
test <- genetest[-j]
meandiff <- mean(refer - test)
gene_markers_meandiff[i,j] <- meandiff
}
}
cluster_markers <- matrix(ncol = ncol(gene_markers_meandiff), nrow=num_marker_genes)
colnames(cluster_markers) <- c(seq(1:num_cluster))
for(i in 1:ncol(gene_markers_meandiff)){
sorted <- sort(gene_markers_meandiff[,i])
interested_genes <- names(sorted)
cluster_markers[,i] <- interested_genes
}
if(print_progress == TRUE){
print("Finding markers based on mean expressing")
}
mean_exp <- c()
mean_data <- c()
for(n in 1:length(cols)){
cluster <- cols[n]
index <- which(pData(marker_input)[,id_by] == cluster)
cells <- rownames(pData(marker_input))[index]
tmp_expr_matrix <- exprs(marker_input)[,cells]
for(i in 1:nrow(tmp_expr_matrix)){
mean_exp <- mean(tmp_expr_matrix[i,])
mean_data <- c(mean_data, mean_exp)
}
}
gene_means <- matrix(data = mean_data, nrow = nrow(fData(marker_input)), ncol = length(unique(pData(marker_input)[,id_by])))
colnames(gene_means) <- c(seq(1:num_cluster))
rownames(gene_means) <- rownames(fData(marker_input))
gene_markers_foldchange <- matrix(ncol = ncol(gene_means), nrow=nrow(gene_means))
colnames(gene_markers_foldchange) <- c(seq(1:num_cluster))
rownames(gene_markers_foldchange) <- rownames(fData(marker_input))
for(i in 1:nrow(gene_means)){
genetest <- gene_means[i,]+.001
for(j in 1:ncol(gene_means)){
refer <- genetest[j]
test <- genetest[-j]
meandiff <- mean(log2(refer / test))
gene_markers_foldchange[i,j] <- meandiff
}
}
cluster_markers_FC <- matrix(ncol = ncol(gene_markers_foldchange), nrow=num_marker_genes)
colnames(cluster_markers_FC) <- c(seq(1:num_cluster))
for(i in 1:ncol(gene_markers_foldchange)){
sorted <- sort(gene_markers_foldchange[,i])
interested_genes <- names(sorted)
cluster_markers_FC[,i] <- interested_genes
}
if(print_progress == TRUE){
print("Merging Lists")
}
marker_genes <- list()
for(i in 1:length(cols)){
cluster <- cols[i]
cm <- cluster_markers[,i]
cf <- cluster_markers_FC[,i]
cm <- as.data.frame(cm)
cf <- as.data.frame(cf)
rownames(cm) <- cm$cm
rownames(cf) <- cf$cf
cm$rankcm <- seq(1:nrow(cm))
cf$rankcf <- seq(1:nrow(cf))
cm <- cm[order(cm$cm),]
cf <- cf[order(cf$cf),]
final <- cbind(cm, cf)
final <- final[,c(2,4)]
final$score <- apply(final,1,sum)
final <- final[order(-final$score),]
cluster_marker_rank <- rownames(final)
val <- match(rownames(fData(input)), cluster_marker_rank)
fData(marker_input)$name <- val
colnames(fData(marker_input))[grep("name", colnames(fData(marker_input)))] <- paste0(cluster,"_marker_score_", id_by) #HERE BE THE BUG!!!
}
fData(marker_input) <- fData(marker_input)[,-match("tmp", colnames(fData(marker_input)))]
return(marker_input)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.