#' generate states list for tSNE from a fasta folder
#' it is expected that the length of each fasta file is equal
#'
#' @param fasta.folder path to fasta folder
#' @param model.path path to model file
#' @param maxlen length of semi-redundant chunks
#' @param batch.size how many chunks are predicted in parallel
#' @param save if TRUE, output will be written to file
#' @param output.folder path to output folder with tailing "/"
#' @param verbose TRUE/FALSE
#' @param exact keep exact values and dont round
#' @export
generateStatesFromFolder <-
function(fasta.folder = "example_files/fasta",
model.path = "example_files/dummy_model_cpu.hdf5",
maxlen = 80,
batch.size = 500,
save = T,
output.folder = "example_files/states/",
verbose = T,
exact = F) {
files <- list.files(fasta.folder, full.names = T)
states.list <- list()
model <- keras::load_model_hdf5(model.path)
# Remove the last 2 layers
keras::pop_layer(model)
keras::pop_layer(model)
for (file in files) {
message(paste("processing: ", file))
states <- deepG::getStatesFromFasta(
model = model,
fasta.path = file,
batch.size = batch.size,
maxlen = maxlen,
verbose = T
)
states <- round(states, digits = 3)
if (save) {
write.table(
states,
file = paste0(
output.folder,
basename(tools::file_path_sans_ext(file)),
".csv"
),
quote = F,
col.names = F
)
message(paste("saved states to", paste0(
output.folder,
basename(tools::file_path_sans_ext(file)),
".csv"
)))
states.list <- NULL
} else {
# keep in memory
states.list[[tools::file_path_sans_ext(basename(file))]] <-
states
}
}
return(states.list)
}
#' Preparing states for tSNE
#'
#' @param states.list list of states generated by generateStatesromFolder
#' @param states.folder generat states list from folder containing tate files
#' @param cell.number number of cell/neuron that should go into tSNE
#' @param flatten if true, all cells will be used
#' @export
extractCellFromStates <- function(states.list = NULL,
states.folder = "example_files/states",
cell.number = 2,
flatten = FALSE) {
# if states.folder is set, no flatten option is implemented currently
if (!is.null(states.folder)) {
message(paste("extracting cell number from", length(list.files(states.folder, full.names = TRUE)), "state files"))
# read in the individual files and join them together, keep track of file names for later reference
states.list <- list()
files <- list.files(states.folder, full.names = TRUE)
if (!flatten) {
for (file in files) {
# reads in file, extracts one column form that (neuron) and saves this to the list
dat <- read.table(file, sep = " ", header = F)
dat$V1 <- NULL # first column is the position
states.list[[tools::file_path_sans_ext(basename(file))]] <- dat[, cell.number]
}
dim <- c(dim(data.frame(states.list[[1]]))[1],
length(states.list)) # should be 920 6 on example dataset
states.array <- array(as.numeric(unlist(states.list)),
dim = dim)
states <- t(states.array)
} else {
for (file in files) {
# flatten, so use all cells
dat <- read.table(file, sep = " ", header = F)
dat$V1 <- NULL # first column is the position
states.list[[tools::file_path_sans_ext(basename(file))]] <- as.vector(t(dat)) # bring 2dim array to 1D vector
}
dim <- c(dim(data.frame(states.list[[1]]))[1],
length(states.list)) # should be 920 6 on example dataset
states.array <- array(as.numeric(unlist(states.list)),
dim = dim)
states <- t(states.array)
}
} else {
dim <- c(dim(data.frame(states.list[[1]]))[1],
dim(data.frame(states.list[[1]]))[2],
length(states.list)) # should be 920 512 6 on example dataset
message(paste("dimension:", dim))
states.array <- array(as.numeric(unlist(states.list)),
dim = dim)
if (flatten) {
states <- t(do.call('rbind', lapply(1:dim(states.array)[3],
function(x)
cbind(states.array[x, , ], t = x)))) # bring 3dim array to 2D
} else {
states <- t(states.array[, cell.number, ])
}
}
return(list("ids" = names(states.list), "states" = states))
}
#' generate and plot tSNE
#'
#' @param states states for tSNE (generated by \code{(extractCellFromStates()})
#' @param perplexity perplexity parameter (should not be bigger than 3 * perplexity < nrow(X) - 1, see details for interpretation)
#' @param pca whether an initial PCA step should be performed (default: TRUE)
#' @param max_iter number of iterations (default: 1000)
#' @param path.metadata file used for coloring
#' @param sample.name.column column in metadata file that holds the sample name
#' @param label.name.column column that holds the label name in path.metadata
#' @param color.column column that holds the color information in path.metadata
#' @param out.path path to output pdf
#' @export
plotTsne <- function(states,
perplexity = c(0.5, 1),
pca = TRUE,
max_iter = 1000,
path.metadata = NULL,
sample.name.column = 1,
label.name.column = 2,
color.column = 3,
out.path = "tsne.pdf") {
pdf(out.path)
for (i in seq_along(perplexity)) {
message(paste("perplexity:", perplexity[i]))
# run tSNE
tsne <- Rtsne::Rtsne(
states$states,
dims = 2,
perplexity = perplexity[i],
check_duplicates = FALSE,
pca = pca,
max_iter = max_iter
)
# plot
if (!is.null(path.metadata)) {
meta <- read.csv2(path.metadata,
header = F,
stringsAsFactors = F)
colors <-
meta[match(states$ids, meta[, sample.name.column]), ][, color.column]
labels <-
meta[match(states$ids, meta[, sample.name.column]), ][, label.name.column]
tsne.df <- data.frame(
x = tsne$Y[, 1],
y = tsne$Y[, 2],
labels = labels,
col = colors,
stringsAsFactors = F
)
p <-
ggplot2::ggplot(tsne.df, ggplot2::aes(x = x, y = y, color = labels))
} else {
tsne.df <- data.frame(
x = tsne$Y[, 1],
y = tsne$Y[, 2],
stringsAsFactors = F
)
p <- ggplot2::ggplot(tsne.df, ggplot2::aes(x = x, y = y))
}
p <- p + ggplot2::geom_point()
p <- p + ggplot2::theme_bw()
p <- p + ggplot2::ggtitle(paste("perplexity:", perplexity[i]))
print(p)
}
dev.off()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.