###########################################
### HELPER FUNCTIONS ###
###########################################
# turn a character vector into a named list of words with the
# original being the name/key and the value being the lowered / non-punctuation version
vec2Words <- function(charVec) {
charVec <- unique(charVec) # remove dupe elements of vector
charVec <- charVec[!is.na(charVec)] # remove NA elements of vector
charVec <- tolower(charVec) # lowercase everything
charVec <- gsub("[[:punct:]]", " ", charVec) # rm punctuation except hyphens and digits
charVec <- gsub("\\d+", " ", charVec) # rm digits
charVec <- gsub("\\s{2,}", " ", charVec) # rm extra spaces generated by previous line
words <- unique(unlist(strsplit(charVec, " "))) # make list of unique words
words <- words[nchar(words) > 2] # remove words with only 1 character, possibly due to subs
}
###########################################
### MAIN FUNCTIONS ###
###########################################
#' @title check for spelling errors in a character vector
#'
#' @description takes character and returns list of problematic
#' spellings and suggestions for replacement
#'
#' @param input character vector
#' @import hunspell
#' @export
checkSpelling <- function(input) {
res <- unique(unlist(hunspell(unique(input)))) # unique wrapper in case single long char vec
probs <- res[lengths(res) > 0]
if (!is.null(probs)) {
suggestions <- hunspell_suggest(probs)
names(suggestions) <- probs
return(suggestions)
} else {
message("No misspellings found!")
return(NULL)
}
}
#' @title Check a character vector against words in ImmuneSpace
#'
#' @description checkByContext compares non-stopwords in a vector to those
#' already in ImmuneSpace. Any words not already in ImmuneSpace are analyzed
#' via stringdist to find closest matches. These matches are returned in a
#' named list.
#'
#' @param input character vector
#' @importFrom stopwords stopwords
#' @importFrom stringdist stringdist
#' @importFrom SnowballC wordStem
#' @export
checkByContext <- function(input) {
# assume that user has run checkSpelling and no mis-spellings in input
# so this looks for issues like "does" in place of "doses"
# TODO: take closer look at stopwords!
if (length(input) == 1) {
words <- unique(vec2Words(input))
} else {
words <- input
}
# for ease of use
freqDF <- data.frame(R2i::ISFreqsAll, stringsAsFactors = F)
colnames(freqDF) <- c("fullWord", "frequency")
# remove non-analytical words
names(words) <- words <- words[!(words %in% stopwords::stopwords(source = "smart"))]
words <- tolower(words)
words <- words[!(words %in% names(R2i::ISFreqsAll))] # rm words that are in IS dbase
# Make DF for relating stems and most likely full words
freqDF$stems <- SnowballC::wordStem(names(R2i::ISFreqsAll))
freqDF <- freqDF[order(freqDF$stems, -freqDF$frequency), ]
# find closest stems and return possible fullwords in frequency order
res <- sapply(words, USE.NAMES = T, function(x) {
# get string distances using OSA method
dists <- stringdist::stringdist(SnowballC::wordStem(x), freqDF$stems)
poss <- as.vector(freqDF$fullWord[dists == min(dists)]) # get rid of levels
})
}
#' @title find and replace characters in a vector
#'
#' @description find all instances of a word in a vector
#' and replace it with another word
#'
#' @param input character vector
#' @param find word to find
#' @param replace replacement word
#' @export
findReplace <- function(input, find, replace) {
tmp <- sapply(input, function(x) {
gsub(find, replace, x)
})
return(unname(tmp))
}
#' @title Interactively find and replace problematic words in a vector
#'
#' @description For each word in the output list from checkSpelling()
#' find the word in the inputVector and allow the user to enter a
#' replacement word
#'
#' @param misspelledWords named list of problematic words and suggested replacements
#' @param inputVector character vector
#' @param outFile filepath for where to append lines of code
#' @export
InteractiveFindReplace.vector <- function(misspelledWords, inputVector, outFile = NULL) {
ret <- inputVector
message("NOTE: leaving the replacement field blank means do not replace.")
for (nm in names(misspelledWords)) {
message(paste0("word not found: ", nm))
message("Possible suggestions: ")
print(misspelledWords[[nm]])
rep <- readline(prompt = paste0("enter replacement for ", nm, ": "))
if (rep == "") {
rep <- nm
}
message("")
ret <- gsub(pattern = nm, replacement = rep, x = ret)
if (!is.null(outFile)) {
codeLn <- paste0("gsub('", nm, "', '", rep, "', x) }))")
cat(codeLn, file = outFile, append = TRUE)
}
}
return(ret)
}
#' @title Interactively check spelling against dictionary and ImmuneSpace-specific words
#'
#' @description Given an input vector and output directory, the user may
#' correct words that are not found in a standard dictionary or a
#' current list of ImmuneSpace specific terms.
#'
#' @param inputVector vector, only character type will be worked on
#' @param vectorName name of vector for use in output R doc
#' @param outputDir filepath for where to append lines of code
#' @export
interactiveSpellCheck.vector <- function(inputVector, vectorName, outputDir) {
# skip if not a character vector
if (typeof(inputVector) != "character") {
message("skipping non-character vector")
return(inputVector)
}
# Want file to be executable
outFile <- paste0(outputDir, "/", vectorName, ".R")
# write first lines of file
header <- paste0(
"# Changes made to ", vectorName, " using interactiveSpellCheck() \n",
"# at ", Sys.time(), "\n"
)
cat(header, file = outFile, append = TRUE)
# run regular spell-check first
message("---- Running Spell Check ---- \n")
misspelledWords <- checkSpelling(inputVector)
# do findReplace
tmpVec <- InteractiveFindReplace.vector(
misspelledWords,
inputVector,
outFile
)
# run checkByContext
message("---- Running Context Check ---- \n")
contextWords <- checkByContext(tmpVec) # fix to not flag regular words like mosquito
# do findReplace
resVec <- InteractiveFindReplace.vector(
contextWords,
tmpVec,
outFile
)
# Add newlines in case wrapped in sapply statement
cat("\n\n", file = outFile, append = TRUE)
names(resVec) <- vectorName
return(resVec)
}
#' @title Interactively find and replace problematic words in a data frame
#'
#' @description For each word in the output list from checkSpelling()
#' find the word in the inputVector and allow the user to enter a
#' replacement word
#'
#' @param misspelledWords named list of problematic words and suggested replacements
#' @param inputDF character vector
#' @param outFile filepath for where to append lines of code
#' @export
InteractiveFindReplace.df <- function(misspelledWords, inputDF, outFile = NULL) {
message("NOTE: leaving the replacement field blank means do not replace.")
ret <- inputDF
for (nm in names(misspelledWords)) {
message(paste0("word not found: ", nm))
message("Possible suggestions: ")
print(misspelledWords[[nm]])
rep <- readline(prompt = paste0("enter replacement for ", nm, ": "))
if (rep == "") {
rep <- nm
}
message("")
ret <- data.frame(lapply(ret, function(x) { # need to deal with case issues
gsub(
pattern = nm,
replacement = rep,
x
)
}))
colnames(ret) <- colnames(inputDF)
if (!is.null(outFile)) {
codeLn <- paste0(
"\ndata.frame(lapply(inputDF, function(x){ gsub(pattern = '",
nm, "', replacement = '", rep, "', x) }))"
)
cat(codeLn, file = outFile, append = TRUE)
}
}
return(ret)
}
#' @title Interactively check spelling against dictionary and ImmuneSpace-specific words
#'
#' @description Given an input dataframe and output directory, the user may
#' correct words that are not found in a standard dictionary or a
#' current list of ImmuneSpace specific terms.
#'
#' @param inputDF dataframe, only character type will be worked on
#' @param outputDir filepath for where to append lines of code
#' @param dfName name of df for use in output R doc, default NULL uses "templateName" attribute
#' @export
interactiveSpellCheck.df <- function(inputDF, outputDir, dfName = NULL) {
if (is.null(dfName)) {
dfName <- attr(inputDF, "templateName")
}
# Want file to be executable
outFile <- paste0(outputDir, "/", dfName, ".R")
# write first lines of file
header <- paste0(
"# Changes made to ", dfName, " using interactiveSpellCheck() \n",
"# at ", Sys.time(), "\n"
)
cat(header, file = outFile, append = TRUE)
# get all unique words from entire DF into one vector
words <- unique(unlist(apply(inputDF, 2, vec2Words)))
# run regular spell-check first
message("---- Running Spell Check ---- \n")
misspelledWords <- checkSpelling(words)
# do InteractivefindReplace.DF ... creates named list of find:replace pairs and then iterates
tmpDF <- InteractiveFindReplace.df(misspelledWords, inputDF, outFile)
# run checkByContext to look at words that are in dictionary, but not accurate (e.g. mistyped)
message("---- Running Context Check ---- \n")
chkdWords <- words[!(words %in% names(misspelledWords))]
contextWords <- checkByContext(chkdWords)
# do findReplace
resDF <- InteractiveFindReplace.df(contextWords, tmpDF, outFile)
# Add newlines in case wrapped in sapply statement
cat("\n\n", file = outFile, append = TRUE)
names(resDF) <- dfName
return(resDF)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.