# library(censocdev)
source("~/censocdev/R/clean-names.R")
library(here)
library(data.table)
library(tidyverse)

# Look at data ----
bunmd <- fread("/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd.csv")
# set.seed(100)
# sampled_data <- bunmd[sample(nrow(bunmd), 10000), ]
# fwrite(sampled_data, "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_sampled_temp.csv")

# Number of records per sex code (count() groups and ungroups internally).
bunmd %>% count(sex)

# Titles and nicknames ----
# Lookup tables read by remove_titles() (titles) and by the nickname merge
# inside the clean_names*() functions (nicknames).
tan_path <- here("codebase/02_create_censoc/titles-and-nicknames")
titles <- fread(file.path(tan_path, "titles.csv"))
male_nicknames <- fread(file.path(tan_path, "male_nicknames.csv"))
female_nicknames <- fread(file.path(tan_path, "female_nicknames.csv"))
# fread() returns data.tables, so rbind() already yields a data.table.
nicknames <- rbind(male_nicknames, female_nicknames)
old_nicknames <- fread(file.path(tan_path, "old_nicknames.csv"))

# Look at data
head(bunmd)
colnames(bunmd)
bunmd %>% count(lname)
# Clean fname, ABE method
# https://github.com/uguryi/abeR/blob/master/R/clean-names.R

#' Remove titles from names
#'
#' Deletes every occurrence of each title, matched with one space on either
#' side, from the supplied strings. Titles are taken from the `title` column
#' of the script-level `titles` table.
#'
#' @param row Character vector of names. Callers prepend a space so that a
#'   title at the very start of a name is also surrounded by spaces.
#' @return `row` with all matched titles replaced by the empty string.
#' @export
remove_titles <- function(row) {
  fname_clean <- row
  for (title in titles$title) {
    # The title is interpolated into the regular expression unescaped.
    pattern <- paste0(" ", title, " ")
    fname_clean <- stringi::stri_replace_all_regex(fname_clean, pattern, "")
  }
  fname_clean
}

#' Clean the name columns of a CSV file (ABE method)
#'
#' Reads `file_name` from `in_path`, derives cleaned first, middle and last
#' names and writes the result to `out_path`, with the `.csv` extension of
#' `file_name` replaced by `_cleaned_e.csv`.
#'
#' Depends on objects defined outside this function: `titles` (through
#' `remove_titles()`), `nicknames`, `split_first_name()` and
#' `merge_names_no_middle()`.
#'
#' @param in_path Directory containing the input file.
#' @param file_name Name of the input CSV file.
#' @param lname_col Name of the last-name column.
#' @param fname_col Name of the first-name column.
#' @param sex_col Name of the sex column; joined against `nicknames$sex`.
#' @param other_cols Character vector of further columns to carry through.
#' @param out_path Directory the cleaned file is written to (created if
#'   missing).
#' @param middle Name of the middle-name column, or `""` if there is none.
#' @param verbose Print progress messages?
#' @return Invisibly, the path of the file that was written.
clean_names <- function(in_path, file_name, lname_col, fname_col, sex_col,
                        other_cols, out_path, middle, verbose = TRUE) {
  has_middle <- length(middle) == 1 && !is.na(middle) && nzchar(middle)

  # Transliterate to ASCII (drops accents), lower-case, trim, turn "." into
  # a space and finally keep only letters and spaces.
  normalise <- function(x) {
    x <- iconv(x, to = "ASCII//TRANSLIT")
    x <- stringr::str_trim(tolower(x))
    x <- gsub("\\.", " ", x)
    gsub("[^a-z ]", "", x)
  }

  # Fix two-word names (e.g., mc donnell --> mcdonnell).
  # NOTES
  # 1. Many more words can be added here, such as:
  #    la, van, du, de, del, o, pal, le, de la
  join_prefixes <- function(x) {
    for (prefix in c("st", "ste", "saint", "virg", "mac", "mc")) {
      x <- gsub(paste0(" ", prefix, " "), paste0(" ", prefix), x)
    }
    x
  }

  # Read data
  if (verbose) {
    print("Reading file")
  }
  path_to_data <- file.path(in_path, file_name)
  # Only request the middle column when one was given; an empty name is not
  # a valid column selection.
  cols_to_read <- c(lname_col, fname_col, sex_col, other_cols,
                    if (has_middle) middle)
  data <- readr::read_csv(path_to_data,
                          col_select = dplyr::all_of(cols_to_read))

  # Copy raw names before processing
  data$lname_clean <- data[[lname_col]]
  data$fname_clean <- data[[fname_col]]

  # Concatenate middle name to first name if middle name exists
  if (verbose) {
    print("concatenating middle names")
  }
  if (has_middle) {
    first_part <- as.character(data$fname_clean)
    middle_part <- as.character(data[[middle]])
    # Missing first name stays missing; a missing middle name leaves the
    # first name unchanged.
    both_present <- !is.na(first_part) & !is.na(middle_part)
    first_part[both_present] <- paste(first_part[both_present],
                                      middle_part[both_present])
    data$fname_clean <- first_part
    data$mname_raw <- data[[middle]]
    data[[middle]] <- NULL
  }
  # NOTES
  # 1. This statement is here to make sure that there aren't any
  #    inconsistencies in relation to middle initials. For example, in the
  #    case of CenSoc, BUNMD already has middle initials but the way the ABE
  #    algorithm processes first names and initials are sometimes different
  #    from the way BUNMD stores first names and initials. ABE turns "A J"
  #    into "A J" as first name and "J" as initial, while in BUNMD "A J"
  #    would be first name "A" and initial "J".

  if (verbose) {
    print("initial name cleaning")
  }
  data$lname_clean <- normalise(data$lname_clean)
  data$fname_clean <- normalise(data$fname_clean)

  # Get rid of titles
  if (verbose) {
    print("removing titles")
  }
  # NOTE(review): paste0() turns a missing name into the string " NA";
  # confirm that this is what the downstream helpers expect.
  data$fname_clean <- remove_titles(paste0(" ", data$fname_clean))

  if (verbose) {
    print("fixing two-word names")
  }
  data$lname_clean <- stringr::str_trim(
    join_prefixes(paste0(" ", data$lname_clean))
  )
  data$fname_clean <- stringr::str_trim(join_prefixes(data$fname_clean))

  # Split first name (i.e., deal with middle names and initials)
  if (verbose) {
    print("splitting first names")
  }
  # SLOW (~60s with ~3M obs). The flattened result carries the element
  # names "first" and "middle", which are used to pull the parts apart.
  parts <- unlist(lapply(data$fname_clean, split_first_name))
  data$first <- parts[names(parts) == "first"]
  data$mname <- parts[names(parts) == "middle"]
  rm(parts)

  # Merge nicknames
  if (verbose) {
    print("merging nicknames")
  }
  # SLOW (~30s with ~3M obs). merge() sorts the result by the join keys.
  data_final <- merge(data, nicknames,
                      by.x = c("first", sex_col),
                      by.y = c("nickname", "sex"),
                      all.x = TRUE)
  # SLOW (~45s with ~3M obs)
  data_final$fname_final <- apply(data_final, 1, merge_names_no_middle)

  # Keep only the first multicharacter part of lname_clean
  if (verbose) {
    print("splitting last names")
  }
  # SLOW (~60s with ~3M obs)
  parts <- unlist(lapply(data_final$lname_clean, split_first_name))
  data_final$lname_final <- parts[names(parts) == "first"]
  rm(parts)

  # Reorder columns, drop unnecessary ones (fname_clean, lname_clean, first,
  # name). The raw middle name only exists when a middle column was given.
  raw_middle <- if (has_middle) "mname_raw"
  cols_to_keep <- c("fname_final", "mname", "lname_final", sex_col,
                    fname_col, raw_middle, lname_col, other_cols)
  data_final <- data_final[, cols_to_keep]
  colnames(data_final) <- c("fname", "mname", "lname", "sex",
                            "fname_raw", raw_middle, "lname_raw", other_cols)

  # Do a final trim
  data_final$lname <- stringr::str_trim(data_final$lname)
  data_final$fname <- stringr::str_trim(data_final$fname)
  data_final$mname <- stringr::str_trim(data_final$mname)

  # Save dataset
  if (verbose) {
    print("writing file")
  }
  dir.create(out_path, showWarnings = FALSE, recursive = TRUE)
  # Replace only a literal trailing ".csv"; if there is none, append the
  # suffix so the input file can never be overwritten.
  out_name <- sub("\\.csv$", "_cleaned_e.csv", file_name)
  if (identical(out_name, file_name)) {
    out_name <- paste0(file_name, "_cleaned_e.csv")
  }
  path_to_processed_data <- file.path(out_path, out_name)
  readr::write_csv(data_final, path_to_processed_data)

  invisible(path_to_processed_data)
}
# Clean the names of the BUNMD record holders ----
in_path <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
lname_col <- "lname"
fname_col <- "fname"
sex_col <- "sex"
out_path <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
# other_cols <- c("ssn", "zip_residence","byear","dyear","socstate","bmonth","dmonth","bday","dday",
#                 "number_apps","race_first","race_last","race_last_cyear","race_first_cyear",
#                 "race_last_cmonth","race_first_cmonth","race_change","number_claims","bpl",
#                 "father_fname","father_mname","father_lname","mother_fname","mother_mname",
#                 "mother_lname","age_first_application","death_age","weight","ccweight",
#                 "bpl_string","socstate_string")
other_cols <- c("ssn",
                "father_fname", "father_mname", "father_lname",
                "mother_fname", "mother_mname", "mother_lname")
middle <- "mname"
file_name <- "bunmd.csv"

# Writes bunmd_cleaned_e.csv into out_path.
clean_names(in_path, file_name, lname_col, fname_col, sex_col, other_cols,
            out_path, middle)

# Spot-check the first 1000 rows of the output.
bunmd_cleaned <- fread(
  "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_cleaned_e.csv",
  nrows = 1000
)
head(bunmd_cleaned %>% arrange(ssn))
# Clean father's names, ABE method
# https://github.com/uguryi/abeR/blob/master/R/clean-names.R

#' Clean the father's name columns of a CSV file (ABE method)
#'
#' Reads `file_name` from `in_path`, derives cleaned first, middle and last
#' names for the father and writes the result to `out_path`, with the `.csv`
#' extension of `file_name` replaced by `_f.csv`. The cleaned columns are
#' named `father_fname`, `father_mname` and `father_lname`; the untouched
#' inputs are kept as `father_*_raw`.
#'
#' Depends on objects defined outside this function: `titles` (through
#' `remove_titles()`), `nicknames`, `split_first_name()` and
#' `merge_names_no_middle()`.
#'
#' @param in_path Directory containing the input file.
#' @param file_name Name of the input CSV file.
#' @param lname_col Name of the father's last-name column.
#' @param fname_col Name of the father's first-name column.
#' @param other_cols Character vector of further columns to carry through.
#' @param out_path Directory the cleaned file is written to (created if
#'   missing).
#' @param middle Name of the father's middle-name column, or `""` if there
#'   is none.
#' @param verbose Print progress messages?
#' @return Invisibly, the path of the file that was written.
clean_names_father <- function(in_path, file_name, lname_col, fname_col,
                               other_cols, out_path, middle, verbose = TRUE) {
  has_middle <- length(middle) == 1 && !is.na(middle) && nzchar(middle)

  # Transliterate to ASCII (drops accents), lower-case, trim, turn "." into
  # a space and finally keep only letters and spaces.
  normalise <- function(x) {
    x <- iconv(x, to = "ASCII//TRANSLIT")
    x <- stringr::str_trim(tolower(x))
    x <- gsub("\\.", " ", x)
    gsub("[^a-z ]", "", x)
  }

  # Fix two-word names (e.g., mc donnell --> mcdonnell).
  # NOTES
  # 1. Many more words can be added here, such as:
  #    la, van, du, de, del, o, pal, le, de la
  join_prefixes <- function(x) {
    for (prefix in c("st", "ste", "saint", "virg", "mac", "mc")) {
      x <- gsub(paste0(" ", prefix, " "), paste0(" ", prefix), x)
    }
    x
  }

  # Read data
  if (verbose) {
    print("reading data")
  }
  path_to_data <- file.path(in_path, file_name)
  data <- readr::read_csv(path_to_data)

  # Copy raw names before processing
  data$lname_clean <- data[[lname_col]]
  data$fname_clean <- data[[fname_col]]

  # Constant sex code used as the join key against `nicknames$sex`
  # (presumably the code for male nicknames; confirm against the table).
  data$father_sex <- 1
  sex_col <- "father_sex"

  # Concatenate middle name to first name if middle name exists
  if (verbose) {
    print("concantenating middle names")
  }
  if (has_middle) {
    first_part <- as.character(data$fname_clean)
    middle_part <- as.character(data[[middle]])
    # Missing first name stays missing; a missing middle name leaves the
    # first name unchanged.
    both_present <- !is.na(first_part) & !is.na(middle_part)
    first_part[both_present] <- paste(first_part[both_present],
                                      middle_part[both_present])
    data$fname_clean <- first_part
    data$father_mname_raw <- data[[middle]]
    data[[middle]] <- NULL
  }
  # NOTES
  # 1. This statement is here to make sure that there aren't any
  #    inconsistencies in relation to middle initials. For example, in the
  #    case of CenSoc, BUNMD already has middle initials but the way the ABE
  #    algorithm processes first names and initials are sometimes different
  #    from the way BUNMD stores first names and initials. ABE turns "A J"
  #    into "A J" as first name and "J" as initial, while in BUNMD "A J"
  #    would be first name "A" and initial "J".

  if (verbose) {
    print('initial name cleaning')
  }
  data$lname_clean <- normalise(data$lname_clean)
  data$fname_clean <- normalise(data$fname_clean)

  # Get rid of titles
  if (verbose) {
    print("removing titles")
  }
  # NOTE(review): paste0() turns a missing name into the string " NA";
  # confirm that this is what the downstream helpers expect.
  data$fname_clean <- remove_titles(paste0(" ", data$fname_clean))

  if (verbose) {
    print("fixing two-word names")
  }
  data$lname_clean <- stringr::str_trim(
    join_prefixes(paste0(" ", data$lname_clean))
  )
  data$fname_clean <- stringr::str_trim(join_prefixes(data$fname_clean))

  # Split first name (i.e., deal with middle names and initials)
  if (verbose) {
    print("splitting names")
  }
  # SLOW (~60s with ~3M obs). The flattened result carries the element
  # names "first" and "middle", which are used to pull the parts apart.
  parts <- unlist(lapply(data$fname_clean, split_first_name))
  data$first <- parts[names(parts) == "first"]
  data$mid <- parts[names(parts) == "middle"]
  rm(parts)

  # Merge nicknames
  if (verbose) {
    print("merging nicknames")
  }
  # SLOW (~30s with ~3M obs). merge() sorts the result by the join keys.
  data_final <- merge(data, nicknames,
                      by.x = c("first", sex_col),
                      by.y = c("nickname", "sex"),
                      all.x = TRUE)
  # SLOW (~45s with ~3M obs)
  data_final$fname_final <- apply(data_final, 1, merge_names_no_middle)

  # Keep only the first multicharacter part of lname_clean
  if (verbose) {
    print("splitting last names")
  }
  # SLOW (~60s with ~3M obs)
  parts <- unlist(lapply(data_final$lname_clean, split_first_name))
  data_final$lname_final <- parts[names(parts) == "first"]
  rm(parts)

  # Reorder columns, drop unnecessary ones (fname_clean, lname_clean, first,
  # name). father_mname_raw only exists when a middle column was given, so
  # it is left out of both the selection and the new names otherwise.
  raw_middle <- if (has_middle) "father_mname_raw"
  cols_to_keep <- c("fname_final", "mid", "lname_final",
                    fname_col, raw_middle, lname_col, other_cols)
  data_final <- data_final[, cols_to_keep]
  colnames(data_final) <- c("father_fname", "father_mname", "father_lname",
                            "father_fname_raw", raw_middle,
                            "father_lname_raw", other_cols)

  # Do a final trim of the columns this function produced
  data_final$father_lname <- stringr::str_trim(data_final$father_lname)
  data_final$father_fname <- stringr::str_trim(data_final$father_fname)
  data_final$father_mname <- stringr::str_trim(data_final$father_mname)

  # Save dataset
  if (verbose) {
    print("writing file")
  }
  dir.create(out_path, showWarnings = FALSE, recursive = TRUE)
  # Replace only a literal trailing ".csv"; if there is none, append the
  # suffix so the input file can never be overwritten.
  out_name <- sub("\\.csv$", "_f.csv", file_name)
  if (identical(out_name, file_name)) {
    out_name <- paste0(file_name, "_f.csv")
  }
  path_to_processed_data <- file.path(out_path, out_name)
  readr::write_csv(data_final, path_to_processed_data)

  invisible(path_to_processed_data)
}
# Clean the fathers' names ----
in_path_f <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
lname_col_f <- "father_lname"
fname_col_f <- "father_fname"
out_path_f <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
# other_cols_f <- c("fname","mname","lname","sex","fname_raw","mname_raw","lname_raw", "ssn","zip_residence","byear","dyear","socstate","bmonth","dmonth","bday","dday",
#                   "number_apps","race_first","race_last","race_last_cyear","race_first_cyear",
#                   "race_last_cmonth","race_first_cmonth","race_change","number_claims","bpl",
#                   "mother_fname","mother_mname",
#                   "mother_lname","age_first_application","death_age","weight","ccweight",
#                   "bpl_string","socstate_string")
other_cols_f <- c("fname", "mname", "lname", "sex",
                  "fname_raw", "mname_raw", "lname_raw",
                  "ssn",
                  "mother_fname", "mother_mname", "mother_lname")
middle_f <- "father_mname"
# Input is the output of the clean_names() step.
file_name_f <- "bunmd_cleaned_e.csv"

# Writes bunmd_cleaned_e_f.csv into out_path_f.
clean_names_father(in_path_f, file_name_f, lname_col_f, fname_col_f,
                   other_cols_f, out_path_f, middle_f)

# Spot-check the first 1000 rows of the output.
bunmd_cleaned_f <- fread(
  "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_cleaned_e_f.csv",
  nrows = 1000
)
head(bunmd_cleaned_f %>% arrange(ssn))
# Clean mother's names, ABE method
# https://github.com/uguryi/abeR/blob/master/R/clean-names.R

#' Clean the mother's name columns of a CSV file (ABE method)
#'
#' Reads `file_name` from `in_path`, derives cleaned first, middle and last
#' names for the mother and writes the result to `out_path`, with the `.csv`
#' extension of `file_name` replaced by `_m.csv`. The cleaned columns are
#' named `mother_fname`, `mother_mname` and `mother_lname`; the untouched
#' inputs are kept as `mother_*_raw`.
#'
#' Depends on objects defined outside this function: `titles` (through
#' `remove_titles()`), `nicknames`, `split_first_name()` and
#' `merge_names_no_middle()`.
#'
#' @param in_path Directory containing the input file.
#' @param file_name Name of the input CSV file.
#' @param lname_col Name of the mother's last-name column.
#' @param fname_col Name of the mother's first-name column.
#' @param other_cols Character vector of further columns to carry through.
#' @param out_path Directory the cleaned file is written to (created if
#'   missing).
#' @param middle Name of the mother's middle-name column, or `""` if there
#'   is none.
#' @param verbose Print progress messages?
#' @return Invisibly, the path of the file that was written.
clean_names_mother <- function(in_path, file_name, lname_col, fname_col,
                               other_cols, out_path, middle, verbose = TRUE) {
  has_middle <- length(middle) == 1 && !is.na(middle) && nzchar(middle)

  # Transliterate to ASCII (drops accents), lower-case, trim, turn "." into
  # a space and finally keep only letters and spaces.
  normalise <- function(x) {
    x <- iconv(x, to = "ASCII//TRANSLIT")
    x <- stringr::str_trim(tolower(x))
    x <- gsub("\\.", " ", x)
    gsub("[^a-z ]", "", x)
  }

  # Fix two-word names (e.g., mc donnell --> mcdonnell).
  # NOTES
  # 1. Many more words can be added here, such as:
  #    la, van, du, de, del, o, pal, le, de la
  join_prefixes <- function(x) {
    for (prefix in c("st", "ste", "saint", "virg", "mac", "mc")) {
      x <- gsub(paste0(" ", prefix, " "), paste0(" ", prefix), x)
    }
    x
  }

  # Read data
  if (verbose) {
    print("reading data")
  }
  path_to_data <- file.path(in_path, file_name)
  data <- readr::read_csv(path_to_data)

  # Copy raw names before processing
  data$lname_clean <- data[[lname_col]]
  data$fname_clean <- data[[fname_col]]

  # Constant sex code used as the join key against `nicknames$sex`
  # (presumably the code for female nicknames; confirm against the table).
  data$mother_sex <- 2
  sex_col <- "mother_sex"

  # Concatenate middle name to first name if middle name exists
  if (verbose) {
    print("concatenating middle names")
  }
  if (has_middle) {
    first_part <- as.character(data$fname_clean)
    middle_part <- as.character(data[[middle]])
    # Missing first name stays missing; a missing middle name leaves the
    # first name unchanged.
    both_present <- !is.na(first_part) & !is.na(middle_part)
    first_part[both_present] <- paste(first_part[both_present],
                                      middle_part[both_present])
    data$fname_clean <- first_part
    data$mother_mname_raw <- data[[middle]]
    data[[middle]] <- NULL
  }
  # NOTES
  # 1. This statement is here to make sure that there aren't any
  #    inconsistencies in relation to middle initials. For example, in the
  #    case of CenSoc, BUNMD already has middle initials but the way the ABE
  #    algorithm processes first names and initials are sometimes different
  #    from the way BUNMD stores first names and initials. ABE turns "A J"
  #    into "A J" as first name and "J" as initial, while in BUNMD "A J"
  #    would be first name "A" and initial "J".

  if (verbose) {
    print("initial name cleaning")
  }
  data$lname_clean <- normalise(data$lname_clean)
  data$fname_clean <- normalise(data$fname_clean)

  # Get rid of titles
  if (verbose) {
    print("removing titles")
  }
  # NOTE(review): paste0() turns a missing name into the string " NA";
  # confirm that this is what the downstream helpers expect.
  data$fname_clean <- remove_titles(paste0(" ", data$fname_clean))

  # Two-word name fix (no progress message is printed for this step).
  data$lname_clean <- stringr::str_trim(
    join_prefixes(paste0(" ", data$lname_clean))
  )
  data$fname_clean <- stringr::str_trim(join_prefixes(data$fname_clean))

  # Split first name (i.e., deal with middle names and initials)
  if (verbose) {
    print("splitting first name")
  }
  # SLOW (~60s with ~3M obs). The flattened result carries the element
  # names "first" and "middle", which are used to pull the parts apart.
  parts <- unlist(lapply(data$fname_clean, split_first_name))
  data$first <- parts[names(parts) == "first"]
  data$mid <- parts[names(parts) == "middle"]
  rm(parts)

  # Merge nicknames
  if (verbose) {
    print("merging nicknames")
  }
  # SLOW (~30s with ~3M obs). merge() sorts the result by the join keys.
  data_final <- merge(data, nicknames,
                      by.x = c("first", sex_col),
                      by.y = c("nickname", "sex"),
                      all.x = TRUE)
  # SLOW (~45s with ~3M obs)
  data_final$fname_final <- apply(data_final, 1, merge_names_no_middle)

  # Keep only the first multicharacter part of lname_clean
  if (verbose) {
    print("splitting last names")
  }
  # SLOW (~60s with ~3M obs)
  parts <- unlist(lapply(data_final$lname_clean, split_first_name))
  data_final$lname_final <- parts[names(parts) == "first"]
  rm(parts)

  # Reorder columns, drop unnecessary ones (fname_clean, lname_clean, first,
  # name). mother_mname_raw only exists when a middle column was given, so
  # it is left out of both the selection and the new names otherwise.
  raw_middle <- if (has_middle) "mother_mname_raw"
  cols_to_keep <- c("fname_final", "mid", "lname_final",
                    fname_col, raw_middle, lname_col, other_cols)
  data_final <- data_final[, cols_to_keep]
  colnames(data_final) <- c("mother_fname", "mother_mname", "mother_lname",
                            "mother_fname_raw", raw_middle,
                            "mother_lname_raw", other_cols)

  # Do a final trim of the columns this function produced
  data_final$mother_lname <- stringr::str_trim(data_final$mother_lname)
  data_final$mother_fname <- stringr::str_trim(data_final$mother_fname)
  data_final$mother_mname <- stringr::str_trim(data_final$mother_mname)

  # Save dataset
  if (verbose) {
    print("saving dataset")
  }
  dir.create(out_path, showWarnings = FALSE, recursive = TRUE)
  # Replace only a literal trailing ".csv"; if there is none, append the
  # suffix so the input file can never be overwritten.
  out_name <- sub("\\.csv$", "_m.csv", file_name)
  if (identical(out_name, file_name)) {
    out_name <- paste0(file_name, "_m.csv")
  }
  path_to_processed_data <- file.path(out_path, out_name)
  readr::write_csv(data_final, path_to_processed_data)

  invisible(path_to_processed_data)
}
# Clean the mothers' names ----
# These settings use an `_m` suffix so they are not confused with the
# father settings. No sex-column setting is needed: clean_names_mother()
# creates its own sex column internally.
in_path_m <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
lname_col_m <- "mother_lname"
fname_col_m <- "mother_fname"
out_path_m <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
# other_cols_m <- c("fname","mname","lname","sex","fname_raw","mname_raw","lname_raw","father_fname",
#                   "father_mname","father_lname", "father_fname_raw",
#                   "father_mname_raw","father_lname_raw",
#                   "ssn","zip_residence",
#                   "byear","dyear","socstate","bmonth","dmonth","bday","dday",
#                   "number_apps","race_first","race_last","race_last_cyear","race_first_cyear",
#                   "race_last_cmonth","race_first_cmonth","race_change","number_claims","bpl",
#                   "age_first_application","death_age","weight","ccweight",
#                   "bpl_string","socstate_string")
other_cols_m <- c("fname", "mname", "lname", "sex",
                  "fname_raw", "mname_raw", "lname_raw",
                  "father_fname", "father_mname", "father_lname",
                  "father_fname_raw", "father_mname_raw", "father_lname_raw",
                  "ssn")
middle_m <- "mother_mname"
# Input is the output of the clean_names_father() step.
file_name_m <- "bunmd_cleaned_e_f.csv"

# Writes bunmd_cleaned_e_f_m.csv into out_path_m.
clean_names_mother(in_path_m, file_name_m, lname_col_m, fname_col_m,
                   other_cols_m, out_path_m, middle_m)

# Read the complete output; it is post-processed below.
bunmd_cleaned_m <- fread(
  "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_cleaned_e_f_m.csv"
)
head(bunmd_cleaned_m %>% arrange(ssn))
#bunmd_names_cleaned <- bunmd_cleaned_m[,c("fname","mname","lname","fname_raw","mname_raw","lname_raw","sex","ssn","zip_residence","byear","dyear","socstate","bmonth","dmonth","bday","dday","number_apps","race_first","race_last","race_last_cyear","race_first_cyear","race_last_cmonth","race_first_cmonth","race_change" , "number_claims","bpl", "age_first_application","death_age","weight","ccweight","bpl_string","socstate_string", "father_fname","father_mname","father_lname","father_fname_raw","father_mname_raw","father_lname_raw","father_sex","mother_fname","mother_mname","mother_lname","mother_fname_raw","mother_mname_raw","mother_lname_raw","mother_sex")] #fwrite(bunmd_names_cleaned, "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_names_cleaned.csv")
# Raw values that stand for "no name available" rather than a real name.
# Misspellings such as "UNKNWON" are listed on purpose: the raw columns are
# matched exactly.
invalid_name_strings <- c(
  "UNK", "UNKNOWN", "UNKNWON", "UNKOWN", "WITHHELD", "MISSING", "NS",
  "NOTSTATED", "NOT STATED", "NOT KNOWN", "NOT LISTED", "NOT RECORD",
  "NOT RECORDED", "NOT SHOWN", "NOT GIVEN", "NOT PROVIDED", "NOT NAMED",
  "NOT AVAILABLE", "NOT VERIFIED", "NOT AVAIL", "NOT ESTABLISHED",
  "NOT", "STATED", "KNOWN", "LISTED", "RECORD", "RECORDED", "SHOWN",
  "GIVEN", "PROVIDED", "NAMED", "AVAILABLE", "VERIFIED", "AVAIL",
  "ESTABLISHED", "APPL", "FURNISHED", "SHOWN ON"
)

# Blank out each cleaned name whose raw counterpart (<field>_raw) is one of
# the invalid strings. The table is updated by reference.
name_fields <- c("father_fname", "father_mname", "father_lname",
                 "mother_fname", "mother_mname", "mother_lname",
                 "fname", "mname", "lname")
for (name_field in name_fields) {
  raw_values <- bunmd_cleaned_m[[paste0(name_field, "_raw")]]
  set(bunmd_cleaned_m,
      i = which(raw_values %in% invalid_name_strings),
      j = name_field,
      value = "")
}
# Build the table of cleaned name fields: ssn plus every column whose name
# contains "name" but not "raw", each suffixed with "_clean".
bunmd_clean_name_fields <- bunmd_cleaned_m %>%
  select(ssn, contains("name")) %>%
  select(!contains("raw")) %>%
  rename_with(~ paste0(.x, "_clean"), .cols = -ssn) %>%
  arrange(ssn) %>%
  # replace <NA> with blank strings in the name fields; ssn is left as is
  mutate(across(-ssn, ~ tidyr::replace_na(as.character(.x), ""))) %>%
  # rearrange columns
  select(all_of(c("ssn",
                  "fname_clean", "mname_clean", "lname_clean",
                  "father_fname_clean", "father_mname_clean",
                  "father_lname_clean",
                  "mother_fname_clean", "mother_mname_clean",
                  "mother_lname_clean")))

readr::write_csv(
  bunmd_clean_name_fields,
  "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_clean_name_fields_only_updated.csv"
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.