# ==============================================================================
# Author: Anthony Sonrel
# Email: anthony.sonrel@uzh.ch
# Created date: -
# ------------------------------------------------------------------------------
# Project:
# pipeComp
#
# Description:
# Wrapper around several data imputation functions, designed for .rds (SCE) or
# .csv input files. The wrapper's arguments are documented below the function
# signature. It can be run locally or on a server (make sure the python and
# tool paths are correct). For scImpute and DCA, the wrapper creates temporary
# folders for intermediate results.
#
# ==============================================================================
DoDataImputation <- function(count,
method = c("SAVERX", "scImpute", "enhance",
"dca", "DrImpute",
"alra"),
organism = c("auto", "Human", "Mouse"),
out_dir = getwd(),
n_cores = 3L,
enhance_path = NULL,
alra_path = NULL,
alra_norm = FALSE,
DrImpute_prepross = TRUE,
dca_path = NULL,
eb_out = "dirichlet",
scImpute_kcluster = NULL,
kcluster_modif = 0,
restr_to_hvgs = FALSE,
python_path = '/usr/bin/python3'){
  # count: path to the .rds (SCE) or .csv file containing the dataset to impute.
  # method: imputation package to use.
  # organism: 'auto' can be kept for the datasets used in the pipeComp project; otherwise specify "Human" or "Mouse".
  # out_dir: deprecated, currently unused.
  # n_cores: number of cores to use for SAVERX, scImpute and DCA.
  # enhance_path: path to the enhance R script, to download from https://github.com/yanailab/enhance.
  # alra_path: path to the 'alra.R' script, to download from https://github.com/KlugerLab/ALRA.
  # alra_norm: logical, whether to log-normalize the data before imputing with alra (advised in the vignette but not stated as required).
  # DrImpute_prepross: logical, whether to filter lowly expressed genes/cells before running DrImpute (advised in the vignette but not stated as required).
  # dca_path: path to the DCA binary. Alternatively, it can simply be set to "dca" if the appropriate virtual environment has been set up with 'reticulate'.
  # kcluster_modif: value added to (or subtracted from) the estimated number of clusters when using scImpute.
  # python_path: path to the python binary, passed to reticulate for SAVERX; specify it if 'sctransfer' is installed under a particular python.
  # scImpute_kcluster: integer, expected number of clusters in the data. Can be set explicitly; by default it is inferred automatically, either from the 'phenoid' column of the colData if an SCE is given, or from the column names of a csv file (warning: trailing digits are stripped, as they are usually cell IDs).
  # eb_out, restr_to_hvgs: currently not used by this wrapper.
suppressPackageStartupMessages({
library(data.table)
library(SingleCellExperiment)
library(purrr)
})
if (all(c("auto", "Human", "Mouse") == organism)) organism <- "auto"
if (length(method) != 1) stop("Please provide 1 method.")
# SAVERX ===================================================================
if (method == "SAVERX") {
if (length(grep("\\.rds$|\\.txt$|\\.csv$|.rds$", count)) != 1) stop("SAVERX needs either a txt, csv or rds file.")
if (organism == "auto") {
if (length(grep("Kumar|simMix2", count)) == 1) organism <- "Mouse" else organism <- "Human"
}
    suppressPackageStartupMessages({
      library(SAVERX)
      library(reticulate)
    })
    reticulate::use_python(python = python_path, required = TRUE)
    suppressPackageStartupMessages({
      library(tensorflow)
      library(keras)
    })
    # restrict the tensorflow session to n_cores threads (TF 1.x API)
    config <- tf$ConfigProto(intra_op_parallelism_threads = n_cores,
                             inter_op_parallelism_threads = n_cores)
    session <- tf$Session(config = config)
    k_set_session(session)
if (length(grep("\\.rds$", count)) == 1) {
library(SingleCellExperiment)
data <- readRDS(count)
saveRDS(counts(data), file = "temp.rds")
res <- saverx(input.file.name = "temp.rds", data.species = organism,
clearup.python.session = TRUE, ncores = n_cores)
file.remove("temp.rds")
} else {
res <- saverx(input.file.name = count, data.species = organism,
clearup.python.session = TRUE, ncores = n_cores)
}
    denois <- readRDS(res)$estimate
# file.remove(res, recursive = TRUE)
# unlink(gsub("\\/.*", "", res), recursive = TRUE)
cat("If you want to run SAVERX a second time, please restart the R session (open issue of the package).\n")
}
# SCIMPUTE ===================================================================
if (method == "scImpute") {
suppressPackageStartupMessages({
library("penalized")
library("scImpute")
})
rand_suffix <- sample(1:1000000, 1)
rand_dir <- paste0("temp_scImpute", rand_suffix, "/")
dir.create(rand_dir)
# file format
if (length(grep("\\.rds", count)) == 1) {
data <- readRDS(count)
if(!"phenoid" %in% colnames(colData(data))) stop("No 'phenoid' in the colData names. Please add it. It should contain the identity of the cells.")
infile = "rds"
if (length(scImpute_kcluster) == 0) kcluster <- length(unique(data$phenoid)) else kcluster <- scImpute_kcluster
count <- paste0("temp_scImpute", rand_suffix, ".rds")
saveRDS(counts(data), file = paste0(rand_dir, count))
input_dir <- rand_dir
} else if (length(grep("\\.csv", count)) == 1) {
infile <- "csv"
data <- read.csv(count, row.names = 1)
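      # infer the number of clusters from the cell names: trailing digits are
      # treated as cell indices, so stripping them leaves one label per cluster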
if (length(scImpute_kcluster) == 0) kcluster <- gsub("[0-9]*$", "", colnames(data)) %>% unique(.) %>% length(.) else kcluster <- scImpute_kcluster
input_dir <- "./"
}
    cat(kcluster, "clusters recognized in the input data.\n")
    if (kcluster_modif != 0) {
      kcluster <- kcluster + kcluster_modif
      cat("...modified to", kcluster, "by the 'kcluster_modif' parameter.\n")
    }
library(doParallel)
scimpute(paste0(input_dir, count), ncores = n_cores, Kcluster = kcluster ,
out_dir = paste0(getwd(), "/", rand_dir), infile = infile, outfile = "rds")
denois <- readRDS(paste0(rand_dir, "scimpute_count.rds"))
# unlink(rand_dir, recursive = TRUE)
}
# ALRA ===================================================================
if (method == "alra") {
library(rsvd)
source(alra_path)
if (length(grep("\\.rds", count)) == 1) {
data <- readRDS(count)
data <- as.matrix(counts(data))
} else if (length(grep("\\.csv", count)) == 1) {
data <- read.csv(count, row.names = 1)
}
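    # ALRA expects a cells x genes matrix, so transpose before imputation and
    # transpose back once done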
data <- t(data)
if (alra_norm) data <- normalize_data(log2(data + 1))
k_choice <- choose_k(data)
denois <- alra(data, k = k_choice$k)[[3]]
rownames(denois) <- rownames(data)
if (!all(dim(denois) == dim(data))) stop("Output dims differ from input dims. Please verify!")
denois <- t(denois)
}
# DCA ========================================================================
if (method == "dca") {
    if (!is(count, "character")) stop("'dca' needs a path to an rds or csv file containing raw counts.")
rand_suffix <- sample(1:1000000, 1)
temp_file <- paste0("temp_DCA", rand_suffix, ".csv")
temp_fold <- paste0("temp_DCA", rand_suffix)
if (length(grep("\\.rds", count)) == 1) {
temp <- readRDS(count)
temp <- counts(temp)
# DCA only accepts integers
# --> floor the count if decimals
if (all(colSums(temp) %% 1 == 0) == FALSE) temp <- floor(temp)
write.csv(as.matrix(temp), file = temp_file, quote = FALSE)
count <- temp_file
} else {
temp <- read.csv(count, row.names = 1)
if (all(colSums(temp) %% 1 == 0) == FALSE) temp <- floor(temp)
write.csv(temp, file = temp_file, quote = FALSE)
}
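    # DCA is run through its command-line interface:
    # <dca_path> <input csv> <output folder> --threads <n_cores>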
command <- paste(dca_path, temp_file, temp_fold, "--threads", n_cores)
system(command = command)
    denois <- read.csv(paste0(temp_fold, "/mean.tsv"), sep = "\t", row.names = 1)
# unlink(temp_fold, recursive = TRUE)
# file.remove(temp_file)
}
# ENHANCE ====================================================================
if (method == "enhance") {
cat("Using 'enhance' tool\n")
source(enhance_path)
if (length(grep("\\.rds", count)) == 1) {
data <- readRDS(count)
data <- as.matrix(counts(data))
} else if (length(grep("\\.csv", count)) == 1) {
data <- read.csv(count, row.names = 1)
}
denois <- try(enhance(data))
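    # if enhance fails with its default parameters, retry with a smaller
    # neighbourhood size (k_nn = 2)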
if(is(denois, "try-error")) denois <- enhance(data, k_nn = 2)
}
# DrImpute ===================================================================
if (method == "DrImpute") {
suppressPackageStartupMessages({
library('DrImpute')
})
    # preprocessing function, adapted from DrImpute
preprocess <- function(x, min.expressed.gene = 0, min.expressed.cell = 2, max.expressed.ratio = 1, normalize.by.size.effect = FALSE){
if (is(x, 'SummarizedExperiment'))
X <- assays(x)$count
else if (is(x,'matrix'))
X <- x
else if (is(x, 'sparseMatrix'))
X <- x
else
stop(sprintf('unknown class(x): %s', class(x)))
M <- ncol(X)
N <- nrow(X)
    m <- Matrix::colSums(X > 1) >= min.expressed.gene # cells with at least min.expressed.gene expressed genes
    n <- Matrix::rowSums(X > 1) <= max.expressed.ratio * M & Matrix::rowSums(X > 1) >= min.expressed.cell # genes detected in at least min.expressed.cell and at most max.expressed.ratio * M cells
if (normalize.by.size.effect){
sf <- apply((X[n, m] + 1) / exp(Matrix::rowMeans(log(X[n, m] + 1))), 2, median)
X <- t(t(X[n, m]) / sf)
}else
X <- X[n, m]
if (is(x, 'SummarizedExperiment')){
x <- x[n, m]
assays(x)$count <- X
}else if (is(x, 'matrix')){
x <- as.matrix(X)
}else if (is(x, 'sparseMatrix')){
x <- X
}
x
} # end of preprocess
if (length(grep("\\.rds", count)) == 1) {
data <- readRDS(count)
data <- counts(data)
}
if (length(grep("\\.csv", count)) == 1) {
data <- read.csv(count, row.names = 1)
data <- as.matrix(data)
}
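    # optionally filter lowly expressed genes/cells, impute on log counts and
    # transform back to the count scale afterwards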
    if (DrImpute_prepross) X <- preprocess(data, min.expressed.gene = 0) else X <- data
Xlog <- log(X + 1)
denois <- DrImpute(Xlog, mc.cores = n_cores)
denois <- exp(denois) - 1
}
return(denois)
}
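# ------------------------------------------------------------------------------
# Example usage (a minimal sketch; the file paths below are placeholders and
# should be adapted to your own setup):
#
#   imputed <- DoDataImputation(count     = "data/dataset1.SCE.rds",
#                               method    = "alra",
#                               alra_path = "tools/ALRA/alra.R",
#                               n_cores   = 4)
#
# The returned object is the imputed genes x cells expression matrix, which can
# be stored back into an SCE assay or written to disk with saveRDS() (note that
# some methods, e.g. DrImpute with preprocessing, may drop filtered genes/cells).
# ------------------------------------------------------------------------------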