R/fct_01_load_data.R

Defines functions showGeneIDs median_fun mean_fun max_fun sum_fun get_all_gene_names convert_data input_data safe_numeric_conversion gene_info

Documented in convert_data gene_info get_all_gene_names input_data max_fun mean_fun median_fun showGeneIDs sum_fun

#' fct_01_load_data.R This file holds all of the main data analysis functions
#' associated with first tab of the iDEP website.
#'
#'
#' @section fct_01_load_data.R functions:
#'
#'
#' @name fct_01_load_data.R
NULL

#' Retrieve detailed info on genes
#'
#' This function retrieves detailed gene information from the
#' database for the matched species.
#'
#' @param converted List returned \code{\link{convert_id}()}. Contains
#'  information about the gene IDs for the matched species.
#' @param select_org String indicating selected organism for the expression
#'  data.Default is "BestMatch."
#' @param idep_data Data files from the database obtained with
#'  \code{\link{get_idep_data}()}
#'
#' @export
#' @return A data frame containing information on all ensembl IDs for the
#'  matched species. Usually a very large data frame due to the amount
#'  of IDs that the data base contains.
#'
#' @family load data functions
gene_info <- function(converted,
                      select_org,
                      idep_data) {
  check <- check_object_state(
    check_exp = (is.null(converted)),
    true_message = as.data.frame("ID not recognized!")
  )
  if (check$bool) {
    return(check)
  }

  query_set <- converted$ids
  check <- check_object_state(
    check_exp = (length(query_set) == 0),
    true_message = as.data.frame("ID not recognized!")
  )
  if (check$bool) {
    return(check)
  }

  conn_db <- connect_convert_db_org(
    select_org = select_org,
    idep_data = idep_data
  )

  # if database connection error
  # see ? try
  if(inherits(conn_db, "try-error")) {
    return(check)
  }

  # does the db file has a geneInfo table?
  ix <- grep(
    pattern = "geneInfo",
    x = DBI::dbListTables(conn_db)
  )

  check <- check_object_state(
    check_exp = (length(ix) == 0),
    true_message = as.data.frame("No matching gene info file found")
  )
  if (check$bool) {
    return(check)
  }

  check <- check_object_state(
    check_exp = (length(ix) != 1),
    true_message = as.data.frame("Multiple geneInfo file found!")
  )
  if (check$bool) {
    return(check)
  } else {
    query_statement <- "select * from geneInfo;"
    gene_info_csv <- DBI::dbGetQuery(conn_db, query_statement)
    DBI::dbDisconnect(conn_db)

# This is removed to make it faster. Make sure the original database
# is upper case, and there is no extra space character in Symbol
#    gene_info_csv[, 1] <- toupper(gene_info_csv[, 1])
    # remove the two spaces in gene symbol
 #   gene_info_csv$symbol <- gsub(
 #     " ",
 #     "",
 #     gene_info_csv$symbol
  #  )

  }

  set <- match(gene_info_csv$ensembl_gene_id, query_set)
  set[which(is.na(set))] <- "Genome"
  set[which(set != "Genome")] <- "List"
  return(cbind(gene_info_csv, set))
}


# Safe conversion function   "11,231" -> 11231
safe_numeric_conversion <- function(x) {
  converted <- suppressWarnings(as.numeric(gsub(",", "", x)))
  ifelse(is.na(converted), NA, converted)
}


#' Load basic data information
#'
#' This function does the immediate loading of the data and
#' sample info to present in the data preview table and the
#' sample info table. The data undergoes very basic filtering
#' and transformation before entering the table.
#'
#' @param expression_file Data file path for the expression file, should be
#'  accessed with \code{expression_file$datapath}
#' @param experiment_file Data file path for the experiment file, should be
#'  accessed with \code{experiment_file$datapath}
#' @param go_button TRUE/FALSE to load the demo data files
#' @param demo_data_file Expression demo data path \code{idep_data$demo_data_file}
#' @param demo_metadata_file Experiment demo data path
#'  \code{idep_data$demo_metadata_file}
#'
#' @export
#' @return This returns a list that contains the expression data
#' and the sample information. If there is no experiment file it
#' only returns the expression data.
#'
#' @family load data functions
input_data <- function(expression_file,
                       experiment_file,
                       go_button,
                       demo_data_file,
                       demo_metadata_file) {
  in_file_data <- expression_file
  in_file_data <- in_file_data$datapath

  if (is.null(in_file_data) && go_button == 0) {
    return(NULL)
  } else if (go_button > 0) { # use demo data
    in_file_data <- demo_data_file
  }
  isolate({
    # Read expression file -----------

    file_extension <- tolower(tools::file_ext(in_file_data))
    if ( file_extension == "xlsx" || 
         file_extension == "xls"
      )  {
      data <- readxl::read_excel(in_file_data)
      data <- data.frame(data)
    } else {
       data <- read.csv(in_file_data,
         header = TRUE, stringsAsFactors = FALSE,
         quote = "\"", comment.char = "",
         blank.lines.skip = TRUE
       )
    }

    # Try tab-delimented if not CSV
    if (ncol(data) <= 1) {
      data <- read.table(
        in_file_data,
        sep = "\t",
        header = TRUE,
        stringsAsFactors = FALSE,
        quote = "\"",
        comment.char = "",
        blank.lines.skip = TRUE
      )
    }

    # try semicolon -delimented if not CSV
    if (ncol(data) <= 1) {
      data <- read.table(
        in_file_data,
        sep = ";",
        header = TRUE,
        stringsAsFactors = FALSE,
        quote = "\"",
        comment.char = "",
        blank.lines.skip = TRUE
      )
    }

    # try space-delimented if not CSV
    if (ncol(data) <= 1) {
      data <- read.table(
        in_file_data,
        sep = " ",
        header = TRUE,
        stringsAsFactors = FALSE,
        quote = "\"",
        comment.char = "",
        blank.lines.skip = TRUE
      )
    }

    
    # if more than one column
    if (ncol(data) > 1) {
      # Convert all columns after the first one to numeric using the safe function
      data[, -1] <- lapply(data[, -1], function(col) {
        if (is.character(col)) {
          return(sapply(col, safe_numeric_conversion))
        } else {
          return(col)
        }
      })

      # rows where all values after the first column are NA cause issues
      # if a row has all NA values down stream processing fails
      # Remove rows where all values after the first column are NA
      data <- data[!apply(is.na(data[, -1]), 1, all), ]

      # Identify rows where all values after the first column are NA
      #all_na_rows <- apply(is.na(data[, -1]), 1, all)
      # Change all NA values in those rows to 0, column by column
      #data[all_na_rows, -1] <- lapply(
      #  data[all_na_rows, -1],
      #  function(col) ifelse(is.na(col), 0, col)
      #)

      # remove columns where all values are NA
      data <- data[, !apply(is.na(data), 2, all)]

      # remove columns where all values are 0
      ix <- apply(data, 2, function(col) all(col == 0))
      if (sum(ix) > 0) {
        showNotification(
          ui = paste0("Warning!!! Columns with all zero values are deleted: ", 
                      paste0(names(data)[ix], collapse = ", ")),
          id = "zero_expression_file",
          duration = 10,
          type = "warning"
        )
        data <- data[, !ix]        
      }
    }

    # cannot parse file; only one or two column
    if (ncol(data) <= 1) {
      showNotification(
        ui = "Error!!! Expression file not recognized. 
        Click the Reset button, examine the file, and try again.",
        id = "error_expression_file",
        duration = NULL,
        type = "error"
      )
      return(NULL)
    }
    # Order by SD ----------
    #data <- data[order(-apply(
    #  data[, 2:ncol(data)],
    #  1,
    #  sd
    #)), ]

    # Format gene ids --------
    #iconv converts latin1 to UTF-8; otherwise toupper(ENSG00000267023Ê) causes error
    data[, 1] <- toupper(iconv(data[, 1], "latin1", "UTF-8"))
    data[, 1] <- gsub(" |\"|\'", "", data[, 1])

    # Remove duplicated genes ----------
    data <- data[!duplicated(data[, 1]), ]

    # Remove rows without genes IDs----------
    data <- data[!is.na(data[, 1]), ]

    # Set gene ids as rownames and get rid of column ---------
    rownames(data) <- data[, 1]
    data <- as.matrix(data[, c(-1)])

    # use janitor to clean up column names;  too slow!!!
    #data <- janitor::clean_names(data)

    # Remove "-" or "." from sample names ----------
    colnames(data) <- gsub("-", "", colnames(data))
    colnames(data) <- gsub("\\.", "", colnames(data))
  })

  # Read experiment file ----------
  in_file_expr <- experiment_file
  in_file_expr <- in_file_expr$datapath
  if (is.null(in_file_expr) && go_button == 0) {
    return(list(
      data = data,
      sample_info = NULL
    ))
  } else if (go_button > 0) {
    sample_info_demo <- NULL
    # if design file is not ""
    if (!is.null(demo_data_file)) {
      if (nchar(demo_metadata_file) > 2) {
        sample_info_demo <- t(read.csv(
          demo_metadata_file,
          row.names = 1,
          header = TRUE,
          colClasses = "character"
        ))
      }
    }
    return(list(
      sample_info = sample_info_demo,
      data = data
    ))
  }

  isolate({
    # Read experiment file ----------
    file_extension <- tolower(tools::file_ext(in_file_expr))
    if ( file_extension == "xlsx" || 
         file_extension == "xls"
      )  {
      expr <- readxl::read_excel(in_file_expr)
      expr <- data.frame(expr)
    } else {
      expr <- read.csv(
        in_file_expr,
        row.names = 1,
        header = TRUE,
        colClasses = "character",
        quote = "\"",
        comment.char = "",
        blank.lines.skip = TRUE
      )
    }

    # Try tab-delimented if not CSV
    if (ncol(expr) <= 1) {
      expr <- read.table(
        in_file_expr,
        row.names = 1,
        sep = "\t",
        header = TRUE,
        colClasses = "character",
        quote = "\"",
        comment.char = "",
        blank.lines.skip = TRUE
      )
    }


    # remove "-" or "." from sample names ----------
    colnames(expr) <- gsub("-", "", colnames(expr))
    colnames(expr) <- gsub("\\.", "", colnames(expr))

    rownames(expr) <- gsub("-", "", rownames(expr))
    rownames(expr) <- gsub("\\.", "", rownames(expr))
    rownames(expr) <- gsub(" ", "", rownames(expr))

    # Matching with column names of expression file ----------
    matches <- match(
      toupper(colnames(data)), toupper(colnames(expr))
    )
    matches <- matches[which(!is.na(matches))] # remove NA

    validate(need(
      length(unique(matches)) == ncol(data) &&
        nrow(expr) >= 1 && nrow(expr) < 500,
      "Error!!! Sample information file not recognized. Column names
       must be exactly the same. Each row is a factor. Each column
       represent a sample.  Please see documentation on format.
       Please click on the Reset button and try again."
    ))

    # Check factor levels, change if needed ----------
    ignored_factors <- c()
    for (i in 1:nrow(expr)) {
      expr[i, ] <- gsub("-", "", expr[i, ])
      expr[i, ] <- gsub("\\.", "", expr[i, ])
      # Remove whitespace
      expr[i, ] <- gsub(" ", "", expr[i, ])
      # Convert to upper case to avoid mixing
      expr[i, ] <- toupper(expr[i, ])
      if(length(unique(t(expr[i, ]))) == 1) {
        ignored_factors <- c(ignored_factors, i)
      }
    }

    if(length(ignored_factors) == nrow(expr)) {
      # ignore design file.
      matches <- 1
      showNotification(
        ui = "Ignoring design file. All rows have one unique levels.",
        id = "ignore_all_rows",
        duration = NULL,
        type = "default"
      )
    } else if(length(ignored_factors) > 0) {    
      # delete the row with only one level
      ignored_rows <- paste0(row.names(expr)[ignored_factors], collapse = ", ")
      showNotification(
        ui = paste("Ignoring rows with one unique levels:", ignored_rows),
        id = "ignore_some_rows",
        duration = NULL,
        type = "default"
      )
      expr <- expr[-1 * ignored_factors, ] 
    }

    # Factor levels match ---------
    if (length(unique(matches)) == ncol(data)) {
      expr <- expr[, matches]
      if (
        sum(apply(expr, 1, function(y) length(unique(y)))) >
          length(unique(unlist(expr)))) {
        factor_names <- apply(
          expr,
          2,
          function(y) paste0(names(y), y)
        )
        rownames(factor_names) <- rownames(expr)
        expr <- factor_names
      }
      return(list(
        data = data,
        sample_info = t(expr)
      ))
    } else {
      return(list(
        data = data
      ))
    }
  })
}

#' Convert expression data rownames to ensembl
#'
#' This function works with the converted ids to take in the
#' expression data and swap the rownames to the converted ids.
#' It returns the data exactly the same except for the changed
#' ids.
#'
#' @param converted Data from \code{\link{convert_id}()} function containing
#'  converted ids
#' @param no_id_conversion TRUE/FALSE for converting data ids or not
#' @param data Data from the input expression file
#' @param multiple_map String to designate how to handle values when multiple
#'   ids are matched to the same gene. Options are 'mean', 'median', 'sum',
#'   'max', and 'max_id'.
#'
#' @export
#' @return Returns original data with rownames converted to ensembl
#'
#' @family load data functions
convert_data <- function(converted,
                         data,
                         no_id_conversion,
                         multiple_map =
                           c("mean", "median", "sum", "max", "max_sd")) {
  if (is.null(converted) || no_id_conversion) {
    return(list(
      data = data,
      mapped_ids = rownames(data)
    ))
  } else {
    mapping <- converted$conversion_table

    rownames(data) <- toupper(rownames(data))
    merged <- merge(
      mapping[, 1:2],
      data,
      by.y = "row.names",
      by.x = "User_input",
      all.y = TRUE
    )
    no_match <- which(is.na(merged[, 2]))
    merged[no_match, 2] <- merged[no_match, 1]

    mapped_ids <- merged[, 1:2]

    # remove user id
    merged <- merged[, -1]

    if (multiple_map == "max_sd") {
      # Multiple matches use one with highest SD ----------
      tmp <- apply(merged[, 2:(ncol(merged))], 1, sd)
      merged <- merged[order(merged[, 1], -tmp), ]
      merged <- merged[!duplicated(merged[, 1]), ]
    } else {
      # aggregate multiple ids mapped to the same gene
      merged <- switch(multiple_map,
        "sum" = aggregate(
          . ~ ensembl_gene_id,
          data = merged,
          FUN = sum_fun,
          #na.rm = TRUE, # if used, missing values are replaced with NA.
          na.action = na.pass  # otherwise, missing values --> genes removed
        ),
        "mean" = aggregate(
          . ~ ensembl_gene_id,
          data = merged,
          FUN = mean_fun,
          na.action = na.pass
        ),
        "max" = aggregate(
          . ~ ensembl_gene_id,
          data = merged,
          FUN = max_fun,
          na.action = na.pass
        ),
        "median" = aggregate(
          . ~ ensembl_gene_id,
          data = merged,
          FUN = median_fun,
          na.action = na.pass
        )
      )
    }

    rownames(merged) <- merged[, 1]
    merged <- as.matrix(merged[, c(-1)])

    # Order by SD ----------
    merged <- merged[order(-apply(
      merged[, 1:ncol(merged)],
      1,
      function(x) sd(x, na.rm = TRUE)
    )), ]

    return(list(
      data = merged,
      mapped_ids = mapped_ids
    ))
  }
}

#' Get all matched gene names
#'
#' This function will create a data frame with all the matched
#' gene names from the database.
#'
#' @param mapped_ids Matched IDs from \code{\link{convet_id}()}
#' @param all_gene_info Gene information matched species in idep database from
#'  \code{\link{gene_info}()}
#'
#'
#' @export
#' @return Data frame containing all the matched ID names from idep
#' database. Three columns denotes a recognized species for which
#' idep had gene names for. Two columns means the IDs were converted
#' to ensembl format, but no species was found for the gene names.
#' One means no conversion occurred.
#'
#' @family load data functions
get_all_gene_names <- function(mapped_ids,
                               all_gene_info) {
  # not converted
  if (is.null(dim(mapped_ids))) {
    return(data.frame(
      "User_ID" = mapped_ids,
      "ensembl_ID" = mapped_ids, # dummy data
      "symbol" = mapped_ids # dummy data
    ))
  } else if (!is.null(all_gene_info$bool)) { # ensembl ID only, no symbol
    return(data.frame(
      "User_ID" = mapped_ids[, 1],
      "ensembl_ID" = mapped_ids[, 2],
      "symbol" = mapped_ids[, 1] # dummy data
    ))
  } else {
    mapped_ids <- data.frame(
      "User_ID" = mapped_ids[, 1],
      "ensembl_ID" = mapped_ids[, 2]
    )
    all_names <- merge(
      mapped_ids,
      all_gene_info[, c("ensembl_gene_id", "symbol")],
      by.x = "ensembl_ID",
      by.y = "ensembl_gene_id",
      all.x = T
    )
    all_names <- all_names[!duplicated(all_names$ensembl_ID), ]
    all_names$symbol[all_names$symbol == ""] <- NA
    all_names$symbol[is.na(all_names$symbol)] <- {
      all_names$ensembl_ID[is.na(all_names$symbol)]
    }
    duplicates <- all_names$symbol %in%
      all_names$symbol[duplicated(all_names$symbol)]
    all_names$symbol[duplicates] <- paste(
      all_names$symbol[duplicates],
      all_names$ensembl_ID[duplicates]
    )
    all_names <- dplyr::select(
      all_names,
      User_ID,
      ensembl_ID,
      symbol,
      tidyselect::everything()
    )

    return(all_names)
  }
}

#' Calculate sum of vector
#'
#' This function refines the sum function to handle missing value, used 
#' with the aggregate function
#'
#' @param x a vector 
#'
#' @export
#' @return a number, or NA if all the numbers are NA
#'
sum_fun <- function(x){
  if(sum(is.na(x)) == length(x)) {
    return(NA)
  } else {
    sum(x, na.rm = TRUE)
  }
}

#' Calculate max of vector
#'
#' This function refines the max function to handle missing value, used 
#' with the aggregate function
#'
#' @param x a vector 
#'
#' @export
#' @return a number, or NA if all the numbers are NA
#'
max_fun <- function(x){
  if(sum(is.na(x)) == length(x)) {
    return(NA)
  } else {
    max(x, na.rm = TRUE)
  }
}


#' Calculate mean of vector
#'
#' This function refines the mean function to handle missing value, used 
#' with the aggregate function
#'
#' @param x a vector 
#'
#' @export
#' @return a number, or NA if all the numbers are NA
#'
mean_fun <- function(x){
  if(sum(is.na(x)) == length(x)) {
    return(NA)
  } else {
    mean(x, na.rm = TRUE)
  }
}

#' Calculate median of vector
#'
#' This function refines the median function to handle missing value, used 
#' with the aggregate function
#'
#' @param x a vector 
#'
#' @export
#' @return a number, or NA if all the numbers are NA
#'
median_fun <- function(x){
  if(sum(is.na(x)) == length(x)) {
    return(NA)
  } else {
    median(x, na.rm = TRUE)
  }
}




#' Show sample gene IDs in modal
#'
#' Show example gene IDs for a selected species.
#'
#' @param species String indicating selected organism for the expression
#' @param db String indicating selected database
#' @param nGenes Number of genes to show for each idType
#'
#' @export
#' @return a data frame with example gene IDs
#'
showGeneIDs <- function(species, db, nGenes = 10){
  # Given a species ID, this function returns 10 gene ids for each idType
  if(species == "BestMatch")
    return(as.data.frame("Select a species above.") )

  converted <- NULL
  try(
    converted <- DBI::dbConnect(
      drv = RSQLite::dbDriver("SQLite"),
      dbname = paste0(DATAPATH, "/db/", db),
      flags = RSQLite::SQLITE_RO #read only mode
    ),
    silent = TRUE
  )

  if(is.null(converted)){
    showNotification(
      ui = paste("Selected database is not downloaded"),
      id = "db_notDownloaded",
      duration = 2.5,
      type = "error"
    )
    return()
  }
  removeNotification("db_notDownloaded")
  showNotification(
    ui = paste("Querying Data...  May take 5 minutes."),
    id = "ExampleIDDataQuery",
    duration = NULL,
    type = "message"
  )

  idTypes <- DBI::dbGetQuery(
    conn = converted,   
    paste0( 
      "WITH RandomIds AS (
      SELECT m.idType,
           m.id,
           ROW_NUMBER() OVER (PARTITION BY m.idType ORDER BY RANDOM()) AS rn
      FROM Mapping m
      )
      SELECT i.*, r.id AS RandomId
      FROM idIndex i
      LEFT JOIN RandomIds r ON i.id = r.idType AND r.rn <= ", nGenes, ";"
    )
  )
  DBI::dbDisconnect(converted) # suggested by GitHub Copilot

  result <- aggregate(
    RandomId ~ idType, 
    data = idTypes,
    FUN = function(x) paste(x, collapse = "; ")
  )
  colnames(result) <- c("ID Type", "Examples")
  
  # put symbols first, refseq next, followed by ensembls. Descriptions (long gnee names) last
  result <- result[ order( grepl("ensembl", result$'ID Type'), decreasing = TRUE), ]
  result <- result[ order( grepl("refseq", result$'ID Type'), decreasing = TRUE), ]
  result <- result[ order( grepl("symbol", result$'ID Type'), decreasing = TRUE), ]
  result <- result[ order( grepl("description", result$'ID Type'), decreasing = FALSE), ]

  return(result)
}

### Add Example Gene ID column to database

# -- Create a temporary table to store the concatenated Example IDs
# CREATE TEMPORARY TABLE TempExampleIds AS
# WITH RandomIds AS (
#   SELECT m.idType,
#   m.id,
#   ROW_NUMBER() OVER (PARTITION BY m.idType ORDER BY RANDOM()) AS rn
#   FROM Mapping m
# )
# SELECT i.id, GROUP_CONCAT(DISTINCT r.id) AS "ExampleIDs"
# FROM idIndex i
# LEFT JOIN RandomIds r ON i.id = r.idType AND r.rn <= 100
# GROUP BY i.id;
# 
# -- Add new column
# -- ALTER TABLE idIndex
# -- ADD ExampleIDs TEXT;
# 
# -- Fill column with example genes
# UPDATE idIndex AS i
# SET "ExampleIDs" = (
#   SELECT ExampleIDs
#   FROM TempExampleIds AS t
#   WHERE t.id = i.id
# );
# 
# -- Drop the temporary table
# DROP  TABLE TempExampleIds;
# 
# -- View results
# SELECT * FROM idIndex
espors/idepGolem documentation built on Oct. 27, 2024, 4:56 a.m.