#' Filter data frame by term
#'
#' @param df Data frame
#' @param query Character string for fts search
#' @param filter_columns Character vector with column names
#' @return A data frame subset according to query conditions
#'
#' @family Workflow functions
#' @family Manipulation functions
#'
#' @description
#' Filter a data frame using full-text search within the data frame.
#' If filter_columns are not specified, search will be performed
#' within the whole data frame.
#' If filter_columns are provided,
#' search will only be limited to these columns.
#' Full-text search works akin to searching within
#' web search engines like Google.
#' You can find out more about it here: https://www.sqlite.org/fts3.html
#' (fts3 is used for consistency with fts search within the SRAdb database).
#'
#' @examples
#' startSpiderSeqRDemo()
#' df <- searchForAccession("SRP026280")
#'
#' # Filter by 'TRIMKD' anywhere within df
#' df <- filterByTerm(df, "TRIMKD")
#' # NOTE: here (as is often the case), the study-level columns
#' # contain information pertaining to all samples and make finding relevant
#' # samples impossible
#'
#' # Filter by 'TRIMKD' only within SRA_experiment_title column
#' df <- filterByTerm(df, "SRA_experiment_title: TRIMKD")
#' # NOTE: you can explore further possibilities by filtering by columns
#' # corresponding to different accession levels,
#' # see \code{\link{filterByTermByAccessionLevel}}
#'
#' # For ease of viewing results
#' df_view <- selectColumns_Overview(df)
#'
#' @export
#'
filterByTerm <- function(df, query, filter_columns=NULL){
# - Check all columns within df
# - add ord column with an index (in case duplicates exist etc.)
# - subset df according to columns
# (- make sure sqlite file does not exist, if it does, delete it)
# - create a database, with fts table
# - fts query to database
# - filter the original df based on the query result
# - remove additional ord column
filter_con <- NULL
if (exists("filter_con", envir = .GlobalEnv)){
rm("filter_con", envir = .GlobalEnv)
}
ord <- NULL
rm("filter_con", envir = environment())
if (!("data.frame" %in% class(df))){
stop("Object needs to be a data frame")
}
if ("ord" %in% colnames(df)){
warning("Data frame contains 'ord' column, which will be overwritten")
}
# Create a new column with an index
df$ord <- seq(dim(df)[1])
.mm("Filter Columns", "dev")
.mm(paste0(filter_columns, collapse = "\n"), "dev")
#print(filter_columns)
#print(colnames(df))
if (!is.null(filter_columns)){
if (!all(filter_columns %in% colnames(df))){
stop("All columns need to be within the data frame")
}
# Get logical of column indices to search within, including ord
col_ind <- colnames(df) %in% c("ord", filter_columns)
} else {
col_ind <- rep(TRUE, dim(df)[2])
}
df_subset <- df[, col_ind]
#print(class(df_subset))
#print(df_subset)
.vex("temp_df_subset", df_subset)
# Create db
filter_db_file <- "filter_db.sqlite"
if (file.exists(filter_db_file)) file.remove(filter_db_file)
#----if (dim(df)[1] ==0) stop("Empty data frame")
#----df <- as.data.frame(df)
# .GlobalEnv$filter_con <- DBI::dbConnect(RSQLite::SQLite(),
# dbname = filter_db_file)
.GlobalEnv$filter_con <- DBI::dbConnect(RSQLite::SQLite(),
dbname = filter_db_file)
# RSQLite::dbWriteTable(conn = get("filter_con", envir = .GlobalEnv),
# name="df", value = df_subset, field.types = NULL,
# row.names = FALSE, append = TRUE)
RSQLite::dbWriteTable(conn = get("filter_con",
envir = .GlobalEnv), name="df",
value = df_subset)
# Create fts table of the db
.createFtsTable("filter_con", "df", "df_ft")
query <- paste0("SELECT ord FROM df_ft WHERE df_ft MATCH '", query, "'")
# This will be a df
filt_ord <- DBI::dbGetQuery(.GlobalEnv$filter_con, query)$ord
DBI::dbDisconnect(filter_con)
file.remove(filter_db_file)
rm(filter_con, envir = .GlobalEnv)
df <- df %>% dplyr::filter(ord %in% filt_ord)
df <- df[, !(colnames(df) %in% "ord")]
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.