#' fix_bad_mgi_symbols
#' - Given an expression matrix, wherein the rows are supposed to be MGI
#' symbols, find those symbols which are not official MGI symbols, then
#' check in the MGI synonm database for whether they match to a proper MGI
#' symbol. Where a symbol is found to be an aliases for a gene that is already
#' in the dataset, the combined reads are summed together.
#'
#' Also checks whether any gene names contain "Sep", "Mar" or "Feb".
#' These should be checked for any suggestion that excel has corrupted the
#' gene names.
#' @param exp An expression matrix where the rows are MGI symbols, or a
#' SingleCellExperiment (SCE) or
#' other Ranged Summarized Experiment (SE) type object.
#' @param mrk_file_path Path to the MRK_List2 file which can be downloaded
#' from www.informatics.jax.org/downloads/reports/index.html
#' @param printAllBadSymbols Output to console all the bad gene symbols
#' @param as_sparse Convert \code{exp} to sparse matrix.
#' @param verbose Print messages.
#' @param localHub If working offline, add argument localHub=TRUE to work
#' with a local, non-updated hub; It will only have resources available that
#' have previously been downloaded. If offline, Please also see BiocManager
#' vignette section on offline use to ensure proper functionality.
#'
#' @returns Returns the expression matrix with the rownames corrected and rows
#' representing the same gene merged. If no corrections are necessary, input
#' expression matrix is returned. If a SingleCellExperiment (SCE) or other
#' Ranged Summarized Experiment (SE) type object was inputted this will be
#' returned with the corrected expression matrix under counts.
#'
#' @examples
#' # Load the single cell data
#' cortex_mrna <- ewceData::cortex_mrna()
#' # take a subset for speed
#' cortex_mrna$exp <- cortex_mrna$exp[1:50, 1:5]
#' cortex_mrna$exp <- fix_bad_mgi_symbols(cortex_mrna$exp)
#' @export
#' @importFrom SummarizedExperiment rowRanges assays
#' @importFrom methods as is
#' @importFrom utils read.csv
fix_bad_mgi_symbols <- function(exp,
mrk_file_path = NULL,
printAllBadSymbols = FALSE,
as_sparse = TRUE,
verbose = TRUE,
localHub = FALSE) {
# Check arguments
err_msg <- paste0(
"'exp' is null. It should be a numerical",
" matrix with the rownames being MGI symbols."
)
if (is.null(exp)) {
stop(err_msg)
}
# Check if input is an SCE or SE object
res_sce <- check_sce(exp)
exp <- res_sce$exp
SE_obj <- res_sce$SE_obj
SE_exp <- res_sce$SE_exp
err_msg2 <- paste0(
"Input 'exp' should not contain factors.",
" Perhaps stringsAsFactors was not set while loading."
)
if (!is.null(levels(exp[1, 3]))) {
stop(err_msg2)
}
warn_msg <- paste0(
"Warning: Input 'exp' stored as characters. Converting",
" to numeric. Check that it looks correct."
)
#### Convert to data.table --> data.frame ####
exp <- dt_to_df(exp = exp)
#### Convert characters to numbers ####
exp <- check_numeric(exp = exp)
# Check that exp is not some weird input format like
# those generated by readr functions
if (!any(
is_sparse_matrix(exp),
is_matrix(exp),
is_delayed_array(exp),
is.data.frame(exp)
)) {
stop(
"exp must be either a data.frame",
" matrix (sparse or dense), DelayedArray."
)
}
# Check for symbols which are not real MGI symbols
all_mgi <- ewceData::all_mgi(localHub = localHub)
not_MGI <- rownames(exp)[!rownames(exp) %in% all_mgi]
messager(sprintf("%s rows do not have proper MGI symbols", length(not_MGI)),
v = verbose
)
if (length(not_MGI) > 20) {
messager(paste(not_MGI[seq_len(20)], collapse = ", "), v = verbose)
} else {
messager(paste(not_MGI, collapse = ", "), v = verbose)
}
# if no improper MGI symbols return input
if (length(not_MGI) == 0) {
if (SE_obj) { # if SCE/SE obj inputted return that
return(SE_exp)
} else {
return(exp)
}
}
# Checking for presence of bad date genes, i.e. Sept2 --> 02.Sep
date_like <- not_MGI[grep("Sep|Mar|Feb", not_MGI)]
if (length(date_like) > 0) {
warning(
sprintf(
"Possible presence of excel corrupted date-like genes: %s",
paste(date_like, collapse = ", ")
)
)
}
err_msg3 <- paste0(
"The file path used in mrk_file_path does not",
" direct to a real file. Either leave this argument",
" blank or direct to an MRK_List2.rpt file downloaded",
" from the MGI website. It should be possible to ",
"obtain from: http://www.informatics.jax.org/",
"downloads/reports/MRK_List2.rpt"
)
err_msg4 <- paste0(
"The MRK_List2.rpt file does not seem to have",
" a column named 'Marker.Synonyms..pipe.separated.'"
)
# Load data from MGI to check for synonyms
if (is.null(mrk_file_path)) {
mgi_data <- ewceData::mgi_synonym_data(localHub = localHub)
} else {
if (!file.exists(mrk_file_path)) {
stop(err_msg3)
}
mgi_data <- utils::read.csv(mrk_file_path,
sep = "\t",
stringsAsFactors = FALSE
)
mgi_synonym_data <- ewceData::mgi_synonym_data(localHub = localHub)
if (!"Marker.Synonyms..pipe.separated." %in%
colnames(mgi_synonym_data)) {
stop(err_msg4)
}
# file is downloaded from:
# http://www.informatics.jax.org/downloads/reports/index.html
mgi_data <- mgi_data[!mgi_data$Marker.Synonyms..pipe.separated. == "", ]
}
# If there are too many genes in not_MGI then the grep crashes...
# so try separately
stepSize <- 500
if (length(not_MGI) > stepSize) {
lower <- 1
upper <- stepSize
for (i in seq_len(ceiling(length(not_MGI) / stepSize))) {
if (upper > length(not_MGI)) {
upper <- length(not_MGI)
}
use_MGI <- not_MGI[lower:upper]
tmp <- grep(
paste(use_MGI, collapse = ("|")),
mgi_data$Marker.Synonyms..pipe.separated.
)
if (i == 1) {
keep_rows <- tmp
} else {
keep_rows <- c(keep_rows, tmp)
}
lower <- lower + stepSize
upper <- upper + stepSize
}
} else {
keep_rows <- grep(
paste(not_MGI, collapse = ("|")),
mgi_data$Marker.Synonyms..pipe.separated.
)
}
countBottom <- countTop <- 0
# Count how many "|" symbols are in
# "mgi_data$Marker.Synonyms..pipe.separated" to determine how many rows
# the dataframe needs
allSYN <- matrix("", nrow = length(keep_rows) +
sum(stringr::str_count(
mgi_data$Marker.Synonyms..pipe.separated.,
"\\|"
)), ncol = 2)
colnames(allSYN) <- c("mgi_symbol", "syn")
for (i in keep_rows) {
# if(i %% 100 == 0){print(i)}
syn_data <- unlist(strsplit(
mgi_data$Marker.Synonyms..pipe.separated.[i],
"\\|"
))
tmp <- data.frame(
mgi_symbol = mgi_data[i, ]$Marker.Symbol,
syn = syn_data,
stringsAsFactors = FALSE
)
countBottom <- countTop + 1
countTop <- countBottom + dim(tmp)[1] - 1
allSYN[countBottom:countTop, 1] <- tmp[, 1]
allSYN[countBottom:countTop, 2] <- tmp[, 2]
}
allSYN <- data.frame(allSYN)
matchingSYN <- allSYN[allSYN$syn %in% not_MGI, ]
matchingSYN <- matchingSYN[!duplicated(matchingSYN$syn), ]
matchingSYN <- matchingSYN[as.character(matchingSYN$mgi_symbol) !=
as.character(matchingSYN$syn), ]
rownames(matchingSYN) <- matchingSYN$syn
# Check for duplicates of existing genes
dupGENES <- as.character(matchingSYN$mgi_symbol[matchingSYN$mgi_symbol %in%
rownames(exp)])
msg <- paste0(
"%s poorly annotated genes are replicates of existing genes.",
" These are: "
)
messager(sprintf(msg, length(unique(dupGENES))), v = verbose)
messager(paste(unique(dupGENES), collapse = ", "), v = verbose)
#### Replace mis-used synonyms from the expression data ####
exp <- as.matrix(exp) # IMPORTANT! Allows apply() to work
exp_Good <- exp[!rownames(exp) %in% as.character(matchingSYN$syn), ]
exp_Bad <- exp[as.character(matchingSYN$syn), ]
prelim_exp_Bad <- exp_Bad
# exp_Bad is a vector if only one bad value so need to convert to a
# one line matrix
if (methods::is(exp_Bad, "numeric")) {
exp_Bad <- Matrix::t(as.matrix(exp_Bad))
rownames(exp_Bad) <- as.character(matchingSYN$syn)
}
# Where duplicates exist, sum them together
for (dG in unique(dupGENES)) {
old_good_dG <- exp_Good[rownames(exp_Good) == dG, ]
exp_Good[rownames(exp_Good) == dG, ] <-
apply(rbind(
exp_Good[rownames(exp_Good) == dG, ],
exp_Bad[as.character(matchingSYN$mgi_symbol) ==
dG, ]
), 2, sum)
}
exp_Bad <- exp_Bad[!as.character(matchingSYN$mgi_symbol) %in% dupGENES, ]
matchingSYN_deDup <-
matchingSYN[!as.character(matchingSYN$mgi_symbol) %in% dupGENES, ]
dropDuplicatedMislabelled <-
as.character(matchingSYN_deDup$syn[
duplicated(matchingSYN_deDup$mgi_symbol)
])
matchingSYN_deDup <-
matchingSYN_deDup[
!matchingSYN_deDup$syn %in% dropDuplicatedMislabelled,
]
exp_Bad <- exp_Bad[!rownames(exp_Bad) %in% dropDuplicatedMislabelled, ]
# Replace bad names with mgi symbols
syn_exp_Bad <- rownames(exp_Bad)
exp_Bad_old <- exp_Bad
rownames(exp_Bad) <- as.character(matchingSYN_deDup$mgi_symbol)
# combine results and return
new_exp <- rbind(exp_Good, exp_Bad)
new_exp <- to_sparse_matrix(
exp = new_exp,
as_sparse = as_sparse,
verbose = verbose
)
messager(sprintf(
"%s rows should have been corrected by checking synonyms.",
dim(matchingSYN)[1]
), v = verbose)
still_not_MGI <- sort(rownames(new_exp)[!rownames(new_exp) %in% all_mgi])
messager(sprintf(
"%s rows STILL do not have proper MGI symbols.",
length(still_not_MGI)
), v = verbose)
if (printAllBadSymbols == TRUE) {
messager(paste(still_not_MGI, collapse = ", "), v = verbose)
}
# Now filter results in SE/SCE obj if inputted and return it
if (SE_obj) {
# Update all annotation datasets by replacing by corrected counts, add
# in annotation and meta data if available
SE_exp <- SE_exp[seq_len(nrow(new_exp)), ] # match the number of rows
names(SummarizedExperiment::rowRanges(SE_exp)) <-
rownames(new_exp) # Update gene names
SummarizedExperiment::assays(SE_exp) <- list(counts = new_exp)
return(SE_exp)
}
return(new_exp)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.