R/inference.R

#' Get cell states of semi-redundant chunks
#'
#' @param model.path path to keras model in hdf5 format
#' @param x semi-redundant chunks (one-hot)
#' @param maxlen time steps to unroll for
#' @param batch.size how many samples are trained in parallel
#' @param run.name name of output files without ending
#' @param type Save type; saves as hdf5 if set to "hdf5", otherwise as csv (default)
#' @param verbose Whether to print progress messages
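#'
#' A minimal usage sketch; "model.h5" is a placeholder path and x an
#' already one-hot encoded array, both assumed to exist:
#' @examples
#' \dontrun{
#' # x: array of dim (samples, maxlen, vocabulary size), e.g. from preprocessing
#' states <- getStates(model.path = "model.h5", x = x,
#'                     maxlen = 30, batch.size = 100,
#'                     run.name = "run1", type = "csv")
#' }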
#' @export
getStates <- function(model.path,
                      x,
                      maxlen = 30,
                      batch.size = 100,
                      run.name = "output",
                      type = "csv",
                      verbose = FALSE){
  
  stopifnot(maxlen > 0)
  stopifnot(batch.size > 0)
  
  model <- keras::load_model_hdf5(model.path)
  # Remove the last 2 layers
  keras::pop_layer(model)
  keras::pop_layer(model)
  states <- predict(model, x, batch_size = batch.size)
  # there are no predictions for the first maxlen positions, so prepend zero rows
  states.beginning <- states[1:maxlen, ] * 0
  states.final <- rbind(states.beginning, states)
  if (verbose) message("Saving states ...")
  if (type == "hdf5") {
    # save states as hdf5
    file <- hdf5r::H5File$new(paste0(run.name, "_states.hdf5"), mode = "a")
    file.grp <- file$create_group("states1")
    file.grp[["states"]] <- states.final
    file$close_all()
  } else {
    write.table(states.final,
                file = paste0(run.name, "_states.csv"),
                sep = ";", quote = F, col.names = F,
                row.names = F)
    
  }
  return(states.final)
}

#' Get cell states from a fasta file
#'
#' @param model keras model
#' @param fasta.path path to fasta file
#' @param maxlen time steps to unroll for
#' @param batch.size how many subsequences are predicted in parallel
#' @param verbose print output
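#'
#' A minimal usage sketch; "model.h5" and the fasta path are placeholders:
#' @examples
#' \dontrun{
#' # the model's input length is assumed to match maxlen
#' model <- keras::load_model_hdf5("model.h5")
#' states <- getStatesFromFasta(model = model,
#'                              fasta.path = "example_files/fasta/a.fasta",
#'                              maxlen = 80, batch.size = 100)
#' }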
#' @export
getStatesFromFasta <- function(model = NULL,
                               fasta.path = "example_files/fasta/a.fasta",
                               maxlen = 80,
                               batch.size = 100,
                               verbose = FALSE){
  if (verbose)
    message("Preprocess ...")  
  # prepare fasta
  preprocessed <- deepG::preprocessFasta(fasta.path,
                                         maxlen = maxlen,
                                         vocabulary = c("\n", "a", "c", "g", "t"))
  batch.num <- 1
  batch.start <- 1
  batch.end <- batch.start + batch.size - 1
  states <- list()
  while (batch.start < nrow(preprocessed$X)) {
    if ((batch.start + batch.size) > nrow(preprocessed$X)) {
      if (verbose)
        message("Reduce batch.size temporarily")
      batch.end <- nrow(preprocessed$X)
      # reduced batch size
    }
    if (verbose)
      message(
        paste(
          "Generating batch number",
          batch.num,
          batch.start,
          "-",
          batch.end
        ))
    x.batch <-
      preprocessed$X[batch.start:batch.end, , ] # dim should be (batch_size, length, words)
    states[[batch.num]] <- keras::predict_on_batch(model, x.batch)
    # update batch index
    batch.num <- batch.num + 1 
    batch.start <- batch.end + 1
    batch.end <- batch.start + batch.size - 1
  }
  states.matrix <- do.call(rbind, states)
  return(states.matrix)
}

#' Write states to h5 or csv file
#' 
#' \code{writeStates} removes layers (optional) from a pretrained model, calculates the states for a fasta file and writes the states to an h5/csv file.
#' The function combines all fasta entries in the file into one sequence, so predictor sequences can contain elements from more than one fasta entry.
#' The h5 file also contains the sequence and the positions of the targets corresponding to the states.
#'     
#' @param model.path Path to a pretrained model.
#' @param layer.depth Depth of layer to evaluate. If NULL last layer is used. 
#' @param fasta.path Path to fasta file.
#' @param sequence Character string, ignores fasta.path if argument given.
#' @param round_digits Number of decimal places. 
#' @param batch.size Number of samples to evaluate at once. Does not change output, only relevant for speed and memory.  
#' @param step Frequency of sampling steps.
#' @param filename Filename to store states in.
#' @param vocabulary Vector of allowed characters; characters outside the vocabulary get encoded as 0-vectors.
#' @param returnStates Logical scalar, return states matrix.
#' @param verbose Whether to print model before and after removing layers.
#' @param padding Logical scalar, generate states for first maxlen nucleotides by
#' padding beginning of sequence with 0-vectors.
#' @param file_type Either "h5" or "csv".
#' @param model A keras model. If model and model.path are not NULL, model will be used for inference. 
#' @param mode Either "lm" for language model or "label" for label classification.
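#'
#' A minimal usage sketch; "model.h5" and "a.fasta" are placeholder paths:
#' @examples
#' \dontrun{
#' # write states of the last layer for every position to "states.h5"
#' writeStates(model.path = "model.h5", fasta.path = "a.fasta",
#'             filename = "states.h5", file_type = "h5", mode = "lm")
#' }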
#' @export
writeStates <- function(model.path = NULL, layer.depth = NULL, sequence = NULL, fasta.path, round_digits = 2, filename = "states.h5",
                        step = 1, vocabulary = c("a", "c", "g", "t"), batch.size = 256, verbose = TRUE,
                        returnStates = FALSE, padding = FALSE, file_type = "h5", model = NULL, mode = "lm"){
  
  stopifnot(mode %in% c("lm", "label"))
  stopifnot(file_type %in% c("h5", "csv"))
  stopifnot(batch.size > 0)
  stopifnot(is.null(layer.depth) || layer.depth > 0)
  stopifnot(!file.exists(paste0(filename, ".", file_type)) & !file.exists(filename))
  tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), vocabulary)
  
  # load model and sequence/file
  if (is.null(model)){
    model <- keras::load_model_hdf5(model.path)
  }
  if (!is.null(sequence) && sequence != ""){
    seq <- sequence %>% stringr::str_to_lower()
  } else {
    fasta.file <- Biostrings::readDNAStringSet(fasta.path)
    seq <- paste(fasta.file, collapse = "") %>% stringr::str_to_lower()
  }
  
  # extract maxlen
  maxlen <- model$input$shape[1] 
  
  if (padding){
    seq <- paste0(paste(rep("0", maxlen), collapse = ""), seq)
  }
  
  # start of samples
  if (mode == "lm"){
    numberPotentialSamples <- ceiling((nchar(seq) - maxlen)/step)   
    start_indices <- (0:(numberPotentialSamples - 1) * step) + 1
  } else {
    numberPotentialSamples <- ceiling((nchar(seq) - maxlen + 1)/step)   
    start_indices <- (0:(numberPotentialSamples - 1) * step) + 1
  }
  
  if (!is.null(layer.depth)){
    num_of_layers <- length(model$layers)
    stopifnot(num_of_layers >= layer.depth)
    if (verbose){
      print(model)
      cat("Original model has", num_of_layers, "layers \n")
    }
    
    # remove layers
    if (num_of_layers - layer.depth != 0){
      for (i in 1:(num_of_layers - layer.depth)){
        keras::pop_layer(model)
      }
    }
  }
  
  # extract number of neurons in last layer
  if (length(model$output$shape$dims) == 3){
    layer.size <- model$output$shape[2]
  } else {
    layer.size <- model$output$shape[1]
  }
  
  if (verbose){
    print(model)
  }
  
  # tokenize sequence (seq is already lower case)
  tokSeq <- keras::texts_to_sequences(tokenizer, seq)[[1]] - 1
  
  # add file ending if not given
  if (!stringr::str_detect(filename, pattern = paste0("\\.", file_type, "$"))){
    filename <- paste0(filename, ".", file_type)
  }
  
  if (file_type == "h5"){
    # create h5 file to store states
    h5_file <- hdf5r::H5File$new(filename, mode = "w") 
    if (!missing(fasta.path)) h5_file[["fasta_file"]] <- fasta.path
    
    if (mode == "lm"){  
      if (padding){
        h5_file[["target_positions"]] <- start_indices 
      } else {
        h5_file[["target_positions"]] <- start_indices + maxlen
      }
    } else {
      if (padding){
        h5_file[["sample_end_position"]] <- start_indices - 1
      } else {
        h5_file[["sample_end_position"]] <- start_indices + maxlen - 1
      }
    }
    
    if (!padding){
      h5_file[["sequence"]] <- seq
    } else {
      h5_file[["sequence"]] <- substr(seq, maxlen + 1, nchar(seq))
    }
    h5_file[["states"]] <- array(0, dim = c(0, layer.size)) 
    writer <- h5_file[["states"]]
  }
  
  numberOfBatches <- ceiling(length(start_indices)/batch.size)
  
  row <- 1
  for (i in 1:numberOfBatches){
    # last entry might be shorter than batch.size
    if (i == numberOfBatches){
      subsetStartInd <- start_indices[((i-1) * batch.size + 1) : length(start_indices)]
      if (mode == "lm"){
        subsetSeq <- tokSeq[subsetStartInd[1]: (subsetStartInd[length(subsetStartInd)] + maxlen)]
      } else {
        subsetSeq <- tokSeq[subsetStartInd[1]: (subsetStartInd[length(subsetStartInd)] + maxlen - 1)]
      }
    } else {
      subsetStartInd <- start_indices[((i-1) * batch.size + 1) : (i*batch.size)]
      subsetSeq <- tokSeq[subsetStartInd[1] : (subsetStartInd[length(subsetStartInd)] + maxlen)]
    }
    if (mode == "lm"){
      input <- sequenceToArray(sequence = subsetSeq, maxlen = maxlen, vocabulary = vocabulary, startInd = subsetStartInd)[[1]]
    } else {
      input <- sequenceToArrayLabel(sequence = subsetSeq, maxlen = maxlen, vocabulary = vocabulary, startInd = subsetStartInd)
    }
    current_batch_size <- dim(input)[1]
    activations <- keras::predict_on_batch(object = model, x = input)
    activations <- as.array(activations)
    # some layers give predictions for every char in sequence, 
    # since return_sequences = TRUE, filter last prediction 
    if (length(dim(activations)) == 3){
      activations <- activations[ , maxlen, ]
    }
    # last entry might be shorter
    if (i == numberOfBatches){
      numRemainingSamples <- length(start_indices) - row + 1
      activation_batch <- round(activations[1:numRemainingSamples, , drop = FALSE], digits = round_digits)
      if (file_type == "h5"){ 
        writer[row:(row + numRemainingSamples - 1), ] <- activation_batch
      } else {
        # add column with target position for lm or end position for label classification
        if (mode == "lm"){
          if (padding){
            target_position <- subsetStartInd 
          } else {
            target_position <- subsetStartInd + maxlen
          }
        } else {
          if (padding){
            target_position <- subsetStartInd - 1
          } else {
            target_position <- subsetStartInd + maxlen - 1
          }
        }  
        activation_batch <- cbind(activation_batch, target_position)
        if (i == 1){
          if (mode == "lm"){
            col.names <- c(as.character(1:layer.size), "target position")
          } else {
            col.names <- c(as.character(1:layer.size), "sample_end_position")
          }
          write.table(x = activation_batch, file = filename, col.names = col.names, row.names = FALSE, append = FALSE)
        } else {
          write.table(x = activation_batch, file = filename, col.names = FALSE, row.names = FALSE, append = TRUE)
        }
      }
    } else {
      activation_batch <- round(activations, digits = round_digits)
      if (file_type == "h5"){
        writer[row:(row + current_batch_size - 1), ] <-  activation_batch
      } else {
        # add column with target position for lm or end position for label classification
        if (mode == "lm"){
          if (padding){
            target_position <- subsetStartInd 
          } else {
            target_position <- subsetStartInd + maxlen
          }
        } else {
          if (padding){
            target_position <- subsetStartInd - 1
          } else {
            target_position <- subsetStartInd + maxlen - 1
          }
        }  
        activation_batch <- cbind(activation_batch, target_position)
        if (i == 1){
          if (mode == "lm"){
            col.names <- c(as.character(1:layer.size), "target position")
          } else {
            col.names <- c(as.character(1:layer.size), "sample_end_position")
          }
          write.table(x = activation_batch, file = filename, col.names = col.names, row.names = FALSE, append = FALSE)
        } else {
          write.table(x = activation_batch, file = filename, col.names = FALSE, row.names = FALSE, append = TRUE)
        }
      }
      row <- row + current_batch_size
    }
  }
  if (returnStates && file_type == "h5") states <- writer[ , ]
  if (returnStates && file_type == "csv") states <- read.table(filename, header = TRUE)
  if (file_type == "h5") h5_file$close_all()
  if (returnStates) return(states)
}

#' Write states to h5 file
#' 
#' @description \code{writeStatesByFastaEntries} removes layers (optional) from a pretrained model, calculates the states for a fasta file and writes a separate
#' h5 file for every entry in the fasta file. The h5 files also contain the nucleotide sequence and the positions of the targets corresponding to the states.
#' Output files are named file_path + "/Nr" + i + file_name, where i is the number of the fasta entry.
#' 
#' @param model.path Path to a pretrained model.
#' @param layer.depth Depth of layer to evaluate. If NULL last layer is used. 
#' @param fasta.path Path to fasta file.
#' @param round_digits Number of decimal places. 
#' @param file_name Filename to store states, function adds "Nr" + "i" before name, where i is entry number. 
#' @param file_path Path to folder, where to write output.
#' @param step Frequency of sampling steps.
#' @param vocabulary Vector of allowed characters; characters outside the vocabulary get encoded as 0-vectors.
#' @param batch.size Number of samples to evaluate at once. Does not change output, only relevant for speed and memory.  
#' @param padding Logical scalar, generate states for first maxlen nucleotides by
#' padding beginning of sequence with 0-vectors.
#' @param file_type Either "h5" or "csv".
#' @param model A keras model. If model and model.path are not NULL, model will be used for inference.
#' @param mode Either "lm" for language model or "label" for label classification.
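#'
#' A minimal usage sketch; "model.h5", "a.fasta" and "out_dir" are placeholders:
#' @examples
#' \dontrun{
#' # writes e.g. out_dir/Nr1states.h5, out_dir/Nr2states.h5, ...
#' writeStatesByFastaEntries(model.path = "model.h5", fasta.path = "a.fasta",
#'                           file_name = "states.h5", file_path = "out_dir")
#' }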
#' @export
writeStatesByFastaEntries <- function(model.path, layer.depth = NULL, fasta.path, round_digits = 2, file_name = "states.h5",
                                      file_path, step = 1, vocabulary = c("a", "c", "g", "t"), batch.size = 256,
                                      padding = FALSE, file_type = "h5", model = NULL, mode = "lm"){
  
  stopifnot(mode %in% c("lm", "label"))

  if (is.null(model)){
    model <- keras::load_model_hdf5(model.path)
  }
  maxlen <- model$input$shape[1] 
  
  # load fasta file
  fasta.file <- Biostrings::readDNAStringSet(fasta.path)
  df <- as.data.frame(fasta.file)
  df$header <- rownames(df)
  names(df) <- c("seq", "header") 
  rownames(df) <- NULL
  
  for (i in 1:nrow(df)){
    verbose <- FALSE
    if (i == 1) verbose <- TRUE 
    # skip entry if too short 
    if ((nchar(df[i, "seq"]) < maxlen + 1) & !padding) next
    writeStates(model = model, layer.depth = layer.depth, sequence = df[i, "seq"],
                round_digits = round_digits, fasta.path = fasta.path,
                filename = paste0(file_path, "/Nr", as.character(i), file_name),
                step = step, vocabulary = vocabulary, batch.size = batch.size,
                verbose = verbose, padding = padding, file_type = file_type, mode = mode)
  }
}

#' Write states to h5 file
#' 
#' @description \code{statesByFastaOneFile} removes layers (optional) from a pretrained model, calculates the states for a fasta file
#' and writes a separate states matrix in one .h5 file for every fasta entry.
#' The h5 file also contains the nucleotide sequences and the positions of the targets corresponding to the states.
#' 
#' To access the content of the h5 file:
#' h5_path <- "/path/to/file"
#' h5_file <- hdf5r::H5File$new(h5_path, mode = "r")
#' a <- h5_file[["states"]]
#' # show header names
#' names(a)
#' b <- a[["someHeaderName"]]
#' # show state matrix
#' b[ , ]
#' h5_file$close_all()
#' 
#' @param model.path Path to a pretrained model.
#' @param layer.depth Depth of layer to evaluate. If NULL last layer is used. 
#' @param fasta.path Path to fasta file.
#' @param round_digits Number of decimal places. 
#' @param step Frequency of sampling steps.
#' @param h5.filename Filename of h5 file to store states.
#' @param vocabulary Vector of allowed characters; characters outside the vocabulary get encoded as 0-vectors.
#' @param batch.size Number of samples to evaluate at once. Does not change output, only relevant for speed and memory.  
#' @param padding Logical scalar, generate states for first maxlen nucleotides by
#' padding beginning of sequence with 0-vectors. 
#' @param verbose Whether to print model before and after removing layers.
#' @param model A keras model. If model and model.path are not NULL, model will be used for inference. 
#' @param mode Either "lm" for language model or "label" for label classification.
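#'
#' A minimal usage sketch; "model.h5" and "a.fasta" are placeholder paths:
#' @examples
#' \dontrun{
#' # one h5 file with a separate states matrix per fasta entry
#' statesByFastaOneFile(model.path = "model.h5", fasta.path = "a.fasta",
#'                      h5.filename = "states.h5")
#' }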
#' @export
statesByFastaOneFile <- function(model.path, layer.depth = NULL, fasta.path, round_digits = 2, h5.filename = "states.h5",
                                 step = 1,  vocabulary = c("a", "c", "g", "t"), batch.size = 256,
                                 padding = FALSE, verbose = TRUE, model = NULL, mode = "lm"){
  
  stopifnot(mode %in% c("lm", "label"))
  stopifnot(batch.size > 0)
  stopifnot(is.null(layer.depth) || layer.depth > 0)
  stopifnot(!file.exists(paste0(h5.filename,".h5")) & !file.exists(h5.filename))
  
  if (is.null(model)){
    model <- keras::load_model_hdf5(model.path)
  }
  
  # extract maxlen
  maxlen <- model$input$shape[1] 
  
  if (!is.null(layer.depth)){
    num_of_layers <- length(model$layers)
    stopifnot(num_of_layers >= layer.depth)
    if (verbose){
      print(model)
      cat("Original model has", num_of_layers, "layers \n")
    }
    
    # remove layers
    if (num_of_layers - layer.depth != 0){
      for (i in 1:(num_of_layers - layer.depth)){
        keras::pop_layer(model)
      }
    }
  }
  
  # extract number of neurons in last layer
  if (length(model$output$shape$dims) == 3){
    layer.size <- model$output$shape[2]
  } else {
    layer.size <- model$output$shape[1]
  }
  
  if (verbose){
    print(model)
  }
  
  # read fasta file and create separate row in data frame per entry
  fasta.file <- Biostrings::readDNAStringSet(fasta.path)
  df <- as.data.frame(fasta.file)
  names(df) <- "seq"
  df$length <- Biostrings::width(fasta.file)
  df$name <- names(fasta.file)
  rownames(df) <- NULL
  df$seq <- stringr::str_to_lower(df$seq) 
  df$name <- trimws(df$name)
  
  if (padding){
    for (i in 1:nrow(df)){
      df[i, "seq"] <- paste0(paste(rep("0", maxlen), collapse = ""), df[i, "seq"])
      df[i, "length"] <- df[i, "length"] + maxlen
    }
  }
  
  # tokenize sequences
  tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), vocabulary)
  tokSeqComplete <- keras::texts_to_sequences(tokenizer, paste(df$seq, collapse = ""))[[1]] - 1
  
  # create h5 file to store states
  if (!stringr::str_detect(h5.filename, pattern = "\\.h5$")){
    h5.filename <- paste0(h5.filename,".h5")
  }
  file.h5 <- hdf5r::H5File$new(h5.filename, mode = "w") 
  if (!missing(fasta.path)) file.h5[["fasta_path"]] <- fasta.path
  states.grp <- file.h5$create_group("states")
  
  # loop over fasta entries
  for (j in 1:nrow(df)){
    seq <- df[j, "seq"]
    seq_length <- df[j, "length"]
    seq_name <- df[j, "name"]
    if (seq_name == "") seq_name <- paste0("dummyHeader_", as.character(j))
    if (j == 1) {
      start <- 1
    } else {
      # offset in the concatenated sequence: total length of all previous entries
      start <- sum(df$length[1:(j - 1)]) + 1
    }
    end <- start + df$length[j] - 1
    tokSeq <- tokSeqComplete[start:end] 
    
    # start of samples
    if (mode == "lm"){
      numberPotentialSamples <- ceiling((nchar(seq) - maxlen)/step)   
      start_indices <- (0:(numberPotentialSamples - 1) * step) + 1
    } else {
      numberPotentialSamples <- ceiling((nchar(seq) - maxlen + 1)/step)   
      start_indices <- (0:(numberPotentialSamples - 1) * step) + 1
    }
    
    states.grp[[seq_name]] <- array(0, dim = c(0, layer.size)) 
    writer <- states.grp[[seq_name]]
    
    numberOfBatches <- ceiling(length(start_indices)/batch.size)
    
    row <- 1
    for (i in 1:numberOfBatches){
      
      # last entry might be shorter than batch.size
      if (i == numberOfBatches){
        subsetStartInd <- start_indices[((i-1) * batch.size + 1) : length(start_indices)]
        if (mode == "lm"){
          subsetSeq <- tokSeq[subsetStartInd[1]: (subsetStartInd[length(subsetStartInd)] + maxlen)]
        } else {
          subsetSeq <- tokSeq[subsetStartInd[1]: (subsetStartInd[length(subsetStartInd)] + maxlen - 1)]
        }
      } else {
        subsetStartInd <- start_indices[((i-1) * batch.size + 1) : (i*batch.size)]
        subsetSeq <- tokSeq[subsetStartInd[1] : (subsetStartInd[length(subsetStartInd)] + maxlen)]
      }
      if (mode == "lm"){
        input <- sequenceToArray(sequence = subsetSeq, maxlen = maxlen, vocabulary = vocabulary, startInd = subsetStartInd)[[1]]
      } else {
        input <- sequenceToArrayLabel(sequence = subsetSeq, maxlen = maxlen, vocabulary = vocabulary, startInd = subsetStartInd)
      }
      current_batch_size <- dim(input)[1]
      activations <- keras::predict_on_batch(object = model, x = input)
      activations <- as.array(activations)
      # some layers give predictions for every char in sequence, 
      # since return_sequences = TRUE, filter last prediction 
      if (length(dim(activations)) == 3){
        activations <- activations[ , maxlen, ]
      }
      # last entry might be shorter
      if (i == numberOfBatches){
        numRemainingSamples <- length(start_indices) - row + 1
        writer[row:(row + numRemainingSamples - 1), ] <- round(activations[1:numRemainingSamples, , drop = FALSE],
                                                               digits = round_digits)
      } else {
        writer[row:(row + current_batch_size - 1), ] <- round(activations, digits = round_digits)
        row <- row + current_batch_size
      }
    }
    
    if (mode == "lm"){  
      if (padding){
        hdf5r::h5attr(states.grp[[seq_name]], "target_positions") <- start_indices  
      } else {
        hdf5r::h5attr(states.grp[[seq_name]], "target_positions") <- start_indices + maxlen
      }
    } else {
      if (padding){
        hdf5r::h5attr(states.grp[[seq_name]], "sample_end_position") <- start_indices - 1  
      } else {
        hdf5r::h5attr(states.grp[[seq_name]], "sample_end_position") <- start_indices + maxlen - 1
      }
    }
    
    if (!padding){
      hdf5r::h5attr(states.grp[[seq_name]], "sequence") <- seq
    } else {
      hdf5r::h5attr(states.grp[[seq_name]], "sequence") <- substr(seq, maxlen + 1, nchar(seq))
    }
  }
  file.h5$close_all()
}

#' Inference on set of fasta files
#' 
#' @description Wrapper function for the inference functions \code{\link{writeStates}}, \code{\link{writeStatesByFastaEntries}}
#' or \code{\link{statesByFastaOneFile}}. These functions compute states for a single fasta file; \code{statesWrapper} extends them
#' to compute states for a set of fasta files.
#' 
#' Output files in \code{output_path} will be named by appending "_states" + ".h5"/".csv" to the name of the fasta file. If \code{function_name} is
#' "writeStatesByFastaEntries", the number of the fasta entry is additionally added for every fasta file.
#'  
#' @param function_name Name of base function, either "writeStates", "writeStatesByFastaEntries" or "statesByFastaOneFile".
#' @param function_args List of arguments for function in \code{function_name}. If no new argument is given, default values will be used.
#' @param fasta.path Path to folder containing fasta files.
#' @param output_path Path to folder of output files.
#' @examples
#' \dontrun{
#' statesWrapper(function_name = "writeStates",
#'               function_args = list(model.path = "/path/to/model.h5", padding = TRUE, layer.depth = 1),
#'               fasta.path = "/path/to/fasta/folder", output_path = "/where/to/write/output")
#' }
#' @export
statesWrapper <- function(function_name = "writeStates", function_args = list(), fasta.path, output_path){
  
  fasta_files <- list.files(
    path = xfun::normalize_path(fasta.path),
    pattern = paste0("\\.", "fasta", "$"),
    full.names = TRUE)
  stopifnot(length(fasta_files) > 0)
  
  fasta_names <- list.files(
    path = xfun::normalize_path(fasta.path),
    pattern = paste0("\\.", "fasta", "$"),
    full.names = FALSE)
  fasta_names <- stringr::str_remove(fasta_names, "\\.fasta$")
  
  # load model and remove layers before looping through files
  if (is.null(function_args$model)){
    model <- keras::load_model_hdf5(function_args$model.path)
    function_args$model <- model
  } else {
    model <- function_args$model
  }
  
  num_of_layers <- length(model$layers)
  if (is.null(function_args$layer.depth)) function_args$layer.depth <- num_of_layers
  stopifnot(num_of_layers >= function_args$layer.depth)
  if (isTRUE(function_args$verbose)){
    print(model)
    cat("Original model has", num_of_layers, "layers \n")
  }
  
  # remove layers
  if (num_of_layers - function_args$layer.depth != 0){
    for (i in 1:(num_of_layers - function_args$layer.depth)){
      keras::pop_layer(model)
    }
  }
  
  # layers have been removed before calling base function, thus set layer.depth to NULL (don't remove additional layers)  
  function_args$layer.depth <- NULL
  function_args$model <- model
  
  new_arguments <- names(function_args)
  default_arguments <- formals(function_name)
  
  # overwrite default arguments 
  for (arg in new_arguments){
    default_arguments[arg] <- function_args[arg] 
  }
  
  if (function_name == "writeStates") {
    formals(writeStates) <- default_arguments
  }
  if (function_name == "writeStatesByFastaEntries") {
    formals(writeStatesByFastaEntries) <- default_arguments
  } 
  if (function_name == "statesByFastaOneFile") {
    formals(statesByFastaOneFile) <- default_arguments
  } 
  
  for (i in seq_along(fasta_files)){
    
    if (function_name == "writeStates") {
      filename <- paste0(output_path, "/", fasta_names[i], "_states")
      writeStates(fasta.path = fasta_files[i], filename = filename, returnStates = FALSE, verbose = FALSE)
    } 
    
    if (function_name == "writeStatesByFastaEntries") {
      filename <- paste0(fasta_names[i], "_states")
      writeStatesByFastaEntries(file_name = filename, file_path = output_path, fasta.path = fasta_files[i])
    } 
    
    if (function_name == "statesByFastaOneFile") {
      filename <- paste0(output_path, "/", fasta_names[i], "_states")
      statesByFastaOneFile(fasta.path = fasta_files[i], h5.filename = filename, verbose = FALSE)
    } 
  }
}

#' Read states from h5
#' 
#' Reads rows from an h5 file created by \code{\link{writeStatesByFastaEntries}} or \code{\link{writeStates}};
#' rows correspond to positions in the sequence and columns to neurons.
#' 
#' @param h5_path Path to h5 file.
#' @param rows Range of rows to read.
#' @param verbose Print dimensions of the states matrix.
#' @param complete Return all entries if TRUE.
#' @param getTargetPositions Return position of corresponding targets if TRUE.
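#'
#' A minimal usage sketch; "states.h5" is a placeholder path, assumed to have
#' been written by \code{writeStates}:
#' @examples
#' \dontrun{
#' res <- readRowsFromH5(h5_path = "states.h5", rows = 1:10,
#'                       getTargetPositions = TRUE)
#' res$states     # rows 1-10 of the states matrix
#' res$targetPos  # corresponding target positions
#' }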
#' @export
readRowsFromH5 <- function(h5_path, rows, verbose = TRUE, complete = FALSE, getTargetPositions = FALSE){
  h5_file <- hdf5r::H5File$new(h5_path, mode = "r")
  read_states <- h5_file[["states"]]
  read_targetPos <-  h5_file[["target_positions"]]
  if (verbose) {
    cat("states matrix has", dim(read_states[ , ])[1], "rows and ",  dim(read_states[ , ])[2], "columns \n")
  }
  if (complete){
    states <- read_states[ , ]
    targetPos <- read_targetPos[ ]
  } else {
    states <- read_states[rows, ]
    targetPos <- read_targetPos[rows]
  }
  h5_file$close_all()
  if (getTargetPositions){
    return(list(states = states, targetPos = targetPos))
  } else {
    return(states)
  }
}