This Rmd is intended to clean the names of the bunmd file, resulting in the creation of cleaned name variables for an individual, their mother, and their father.

cleaned name variables: fname, mname, lname, father_fname, father_mname, father_lname, mother_fname, mother_mname, mother_lname (each is written to the final file with a `_clean` suffix)

uncleaned name variables: fname_raw, mname_raw, lname_raw, father_fname_raw, father_mname_raw, father_lname_raw, mother_fname_raw, mother_mname_raw, mother_lname_raw

#library(censocdev) source("~/censocdev/R/clean-names.R")
library(here)
library(data.table)
library(tidyverse)
# Load the full BUNMD file and tabulate records by sex as a quick sanity check
bunmd <- fread(
  "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd.csv"
)
#set.seed(100)
#sampled_data <- bunmd[sample(nrow(bunmd), 10000), ]
#fwrite(sampled_data, "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_sampled_temp.csv")

count(group_by(bunmd, sex))

Read titles and nicknames

# Directory holding the title and nickname lookup tables
tan_path <- here("codebase/02_create_censoc/titles-and-nicknames")

# Lookup tables used by the name-cleaning functions below
titles           <- fread(file.path(tan_path, "titles.csv"))
male_nicknames   <- fread(file.path(tan_path, "male_nicknames.csv"))
female_nicknames <- fread(file.path(tan_path, "female_nicknames.csv"))
# Male and female nickname tables stacked into a single lookup table
nicknames        <- data.table(rbind(male_nicknames, female_nicknames))
old_nicknames    <- fread(file.path(tan_path, "old_nicknames.csv"))

# Look at data
head(bunmd)
colnames(bunmd)
count(bunmd, lname)

Read in functions for name cleaning

#clean fname ABE Method https://github.com/uguryi/abeR/blob/master/R/clean-names.R

#' Remove titles from space-padded name strings
#'
#' Each title is matched as a whole word with a space on both sides, so callers
#' are expected to prepend a space to every name before calling. A title that
#' is the very last word of a string (no trailing space) is not matched.
#'
#' A matched title is replaced by a single space rather than by nothing, so the
#' words on either side stay separated ("mary dr smith" does not collapse to
#' "marysmith") and the leading space that later " st " / " mc " fixes rely on
#' is kept when the title was the first word.
#'
#' @param row Character vector of names, each padded with a leading space.
#' @param title_list Character vector of titles, each used as a regular
#'   expression. Defaults to the `title` column of the global `titles` table.
#' @return Character vector the same length as `row` with titles removed.
#' @export
remove_titles <- function(row, title_list = titles$title) {
  fname_clean <- row
  for (title in title_list) {
    pattern <- paste0(" ", title, " ")
    fname_clean <- stringi::stri_replace_all_regex(fname_clean, pattern, " ")
  }
  fname_clean
}


#' Clean an individual's first, middle, and last names (ABE method)
#'
#' Reads the selected columns of `file_name` from `in_path`, standardizes the
#' name fields, and writes the result to `out_path` with the trailing ".csv" of
#' `file_name` replaced by "_cleaned_e.csv".
#'
#' Relies on objects that are not arguments: `remove_titles()` and the
#' `nicknames` table from this file, plus `split_first_name()` and
#' `merge_names_no_middle()`, which are not defined in this file (presumably
#' provided by censocdev's clean-names.R -- confirm).
#'
#' @param in_path Directory containing the input CSV.
#' @param file_name Name of the input CSV file.
#' @param lname_col,fname_col,sex_col Names of the last-name, first-name, and
#'   sex columns in the input.
#' @param other_cols Character vector of extra columns to carry through.
#' @param out_path Output directory (created if it does not exist).
#' @param middle Name of the middle-name column, or "" if there is none.
#' @param verbose If `TRUE`, print a progress message before each step.
#' @return `NULL`, invisibly. Called for the side effect of writing the file.
clean_names <- function(in_path, file_name, lname_col, fname_col, sex_col, other_cols, out_path, middle, verbose = TRUE) {
  has_middle <- middle != ""

  # ASCII-fold, lower-case, trim, turn "." into a space, and drop everything
  # that is not a lower-case letter or a space
  normalize_name <- function(x) {
    x <- iconv(x, to = "ASCII//TRANSLIT")
    x <- tolower(x)
    x <- stringr::str_trim(x)
    x <- gsub("\\.", " ", x)
    gsub("[^a-z ]", "", x)
  }

  # Glue common prefixes onto the following word (e.g., " mc donnell" -->
  # " mcdonnell"); expects a leading space on every element.
  # Many more prefixes could be added: la, van, du, de, del, o, pal, le, de la
  join_prefixes <- function(x) {
    for (prefix in c("st", "ste", "saint", "virg", "mac", "mc")) {
      x <- gsub(paste0(" ", prefix, " "), paste0(" ", prefix), x)
    }
    stringr::str_trim(x)
  }

  # Read data. Only request the middle column when one was given, so that
  # middle = "" does not ask readr for a column named "".
  if (verbose) print("Reading file")
  path_to_data <- paste(in_path, file_name, sep = "/")
  cols_to_read <- c(lname_col, fname_col, sex_col, other_cols)
  if (has_middle) cols_to_read <- c(cols_to_read, middle)
  data <- readr::read_csv(path_to_data, col_select = dplyr::all_of(cols_to_read))

  # Copy raw names before processing
  data$lname_clean <- data[[lname_col]]
  data$fname_clean <- data[[fname_col]]

  # Concatenate middle name to first name if a middle name exists.
  # This avoids inconsistencies around middle initials: ABE turns "A J" into
  # first name "A J" with initial "J", while BUNMD stores first name "A" and
  # initial "J".
  if (verbose) print("concatenating middle names")
  if (has_middle) {
    first_part <- data$fname_clean
    middle_part <- data[[middle]]
    # NA first name stays NA; NA middle leaves the first name unchanged
    both_present <- !is.na(first_part) & !is.na(middle_part)
    first_part[both_present] <- paste(first_part[both_present], middle_part[both_present])
    data$fname_clean <- first_part
    data$mname_raw <- middle_part
    data[[middle]] <- NULL
  }

  if (verbose) print("initial name cleaning")
  data$lname_clean <- normalize_name(data$lname_clean)
  data$fname_clean <- normalize_name(data$fname_clean)

  # Get rid of titles (remove_titles matches " title ", hence the padding)
  if (verbose) print("removing titles")
  data$fname_clean <- remove_titles(paste0(" ", data$fname_clean))

  # Fix two-word names (e.g., mc donnell --> mcdonnell)
  if (verbose) print("fixing two-word names")
  data$lname_clean <- join_prefixes(paste0(" ", data$lname_clean))
  data$fname_clean <- join_prefixes(data$fname_clean)

  # Split first name (i.e., deal with middle names and initials)
  if (verbose) print("splitting first names")
  parts <- unlist(lapply(data$fname_clean, split_first_name)) # SLOW (~60s with ~3M obs)
  data$first <- parts[names(parts) == "first"]
  data$mname <- parts[names(parts) == "middle"]
  rm(parts)

  # Merge nicknames
  if (verbose) print("merging nicknames")
  data_final <- merge(data, nicknames, by.x = c("first", sex_col),
                      by.y = c("nickname", "sex"), all.x = TRUE) # SLOW (~30s with ~3M obs)
  data_final$fname_final <- apply(data_final, 1, merge_names_no_middle) # SLOW (~45s with ~3M obs)

  # Keep only the "first" part that split_first_name returns for lname_clean
  if (verbose) print("splitting last names")
  parts <- unlist(lapply(data_final$lname_clean, split_first_name)) # SLOW (~60s with ~3M obs)
  data_final$lname_final <- parts[names(parts) == "first"]
  rm(parts)

  # Reorder columns, drop working columns (fname_clean, lname_clean, first, ...)
  raw_middle <- if (has_middle) "mname_raw"
  cols_to_keep <- c("fname_final", "mname", "lname_final", sex_col,
                    fname_col, raw_middle, lname_col, other_cols)
  data_final <- data_final[, cols_to_keep]
  colnames(data_final) <- c("fname", "mname", "lname", "sex",
                            "fname_raw", raw_middle, "lname_raw", other_cols)

  # Do a final trim
  for (col in c("lname", "fname", "mname")) {
    data_final[[col]] <- stringr::str_trim(data_final[[col]])
  }

  # Save dataset; only a literal trailing ".csv" is rewritten
  if (verbose) print("writing file")
  dir.create(out_path, showWarnings = FALSE)
  path_to_processed_data <- paste(out_path, sub("\\.csv$", "_cleaned_e.csv", file_name), sep = "/")
  readr::write_csv(data_final, path_to_processed_data)

  invisible(NULL)
}

Apply code to BUNMD fname lname mname

# Arguments for cleaning the individual's own name fields in bunmd.csv
in_path   <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
lname_col <- "lname"
fname_col <- "fname"
sex_col   <- "sex"
out_path  <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
# Full set of pass-through columns (currently disabled):
# other_cols <- c("ssn", "zip_residence","byear","dyear","socstate","bmonth","dmonth","bday","dday",
#                 "number_apps","race_first","race_last","race_last_cyear","race_first_cyear",
#                 "race_last_cmonth","race_first_cmonth","race_change","number_claims","bpl",
#                 "father_fname","father_mname","father_lname","mother_fname","mother_mname",
#                 "mother_lname","age_first_application","death_age","weight","ccweight",
#                 "bpl_string","socstate_string")
other_cols <- c(
  "ssn",
  "father_fname", "father_mname", "father_lname",
  "mother_fname", "mother_mname", "mother_lname"
)
middle    <- "mname"
file_name <- "bunmd.csv"

clean_names(
  in_path    = in_path,
  file_name  = file_name,
  lname_col  = lname_col,
  fname_col  = fname_col,
  sex_col    = sex_col,
  other_cols = other_cols,
  out_path   = out_path,
  middle     = middle
)

Check cleaned dataset

# Peek at the first 1000 rows of the cleaned file, ordered by ssn.
# `nrows` is spelled out in full; `nrow` only worked through partial argument
# matching.
bunmd_cleaned <- fread("/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_cleaned_e.csv",
                       nrows = 1000)
head(bunmd_cleaned %>% arrange(ssn))

Repeat with father names: create the function clean_names_father

#clean fname ABE Method https://github.com/uguryi/abeR/blob/master/R/clean-names.R

#' Clean fathers' first, middle, and last names (ABE method)
#'
#' Reads every column of `file_name` from `in_path`, standardizes the father
#' name fields, and writes the result to `out_path` with the trailing ".csv" of
#' `file_name` replaced by "_f.csv". Output columns are `father_fname`,
#' `father_mname`, `father_lname`, their `*_raw` counterparts, and `other_cols`.
#'
#' Relies on objects that are not arguments: `remove_titles()` and the
#' `nicknames` table from this file, plus `split_first_name()` and
#' `merge_names_no_middle()`, which are not defined in this file (presumably
#' provided by censocdev's clean-names.R -- confirm).
#'
#' @param in_path Directory containing the input CSV.
#' @param file_name Name of the input CSV file.
#' @param lname_col,fname_col Names of the father's last-name and first-name
#'   columns in the input.
#' @param other_cols Character vector of extra columns to carry through.
#' @param out_path Output directory (created if it does not exist).
#' @param middle Name of the father's middle-name column, or "" if none.
#' @param verbose If `TRUE`, print a progress message before each step.
#' @return `NULL`, invisibly. Called for the side effect of writing the file.
clean_names_father <- function(in_path, file_name, lname_col, fname_col, other_cols, out_path, middle, verbose = TRUE) {
  has_middle <- middle != ""

  # ASCII-fold, lower-case, trim, turn "." into a space, and drop everything
  # that is not a lower-case letter or a space
  normalize_name <- function(x) {
    x <- iconv(x, to = "ASCII//TRANSLIT")
    x <- tolower(x)
    x <- stringr::str_trim(x)
    x <- gsub("\\.", " ", x)
    gsub("[^a-z ]", "", x)
  }

  # Glue common prefixes onto the following word (e.g., " mc donnell" -->
  # " mcdonnell"); expects a leading space on every element.
  # Many more prefixes could be added: la, van, du, de, del, o, pal, le, de la
  join_prefixes <- function(x) {
    for (prefix in c("st", "ste", "saint", "virg", "mac", "mc")) {
      x <- gsub(paste0(" ", prefix, " "), paste0(" ", prefix), x)
    }
    stringr::str_trim(x)
  }

  # Read data (all columns)
  if (verbose) print("reading data")
  path_to_data <- paste(in_path, file_name, sep = "/")
  data <- readr::read_csv(path_to_data)

  # Copy raw names before processing
  data$lname_clean <- data[[lname_col]]
  data$fname_clean <- data[[fname_col]]

  # Constant sex code used as the nickname merge key
  # (presumably 1 = male in nicknames$sex -- confirm)
  data$father_sex <- 1
  sex_col <- "father_sex"

  # Concatenate middle name to first name if a middle name exists.
  # This avoids inconsistencies around middle initials: ABE turns "A J" into
  # first name "A J" with initial "J", while BUNMD stores first name "A" and
  # initial "J".
  if (verbose) print("concantenating middle names")
  if (has_middle) {
    first_part <- data$fname_clean
    middle_part <- data[[middle]]
    # NA first name stays NA; NA middle leaves the first name unchanged
    both_present <- !is.na(first_part) & !is.na(middle_part)
    first_part[both_present] <- paste(first_part[both_present], middle_part[both_present])
    data$fname_clean <- first_part
    data$father_mname_raw <- middle_part
    data[[middle]] <- NULL
  }

  if (verbose) print("initial name cleaning")
  data$lname_clean <- normalize_name(data$lname_clean)
  data$fname_clean <- normalize_name(data$fname_clean)

  # Get rid of titles (remove_titles matches " title ", hence the padding)
  if (verbose) print("removing titles")
  data$fname_clean <- remove_titles(paste0(" ", data$fname_clean))

  # Fix two-word names (e.g., mc donnell --> mcdonnell)
  if (verbose) print("fixing two-word names")
  data$lname_clean <- join_prefixes(paste0(" ", data$lname_clean))
  data$fname_clean <- join_prefixes(data$fname_clean)

  # Split first name (i.e., deal with middle names and initials)
  if (verbose) print("splitting names")
  parts <- unlist(lapply(data$fname_clean, split_first_name)) # SLOW (~60s with ~3M obs)
  data$first <- parts[names(parts) == "first"]
  data$mid <- parts[names(parts) == "middle"]
  rm(parts)

  # Merge nicknames
  if (verbose) print("merging nicknames")
  data_final <- merge(data, nicknames, by.x = c("first", sex_col),
                      by.y = c("nickname", "sex"), all.x = TRUE) # SLOW (~30s with ~3M obs)
  data_final$fname_final <- apply(data_final, 1, merge_names_no_middle) # SLOW (~45s with ~3M obs)

  # Keep only the "first" part that split_first_name returns for lname_clean
  if (verbose) print("splitting last names")
  parts <- unlist(lapply(data_final$lname_clean, split_first_name)) # SLOW (~60s with ~3M obs)
  data_final$lname_final <- parts[names(parts) == "first"]
  rm(parts)

  # Reorder columns, drop working columns. father_mname_raw only exists when a
  # middle column was supplied, so it is selected and named only in that case.
  raw_middle <- if (has_middle) "father_mname_raw"
  cols_to_keep <- c("fname_final", "mid", "lname_final",
                    fname_col, raw_middle, lname_col, other_cols)
  data_final <- data_final[, cols_to_keep]
  colnames(data_final) <- c("father_fname", "father_mname", "father_lname",
                            "father_fname_raw", raw_middle, "father_lname_raw",
                            other_cols)

  # Do a final trim of the father name columns produced above
  for (col in c("father_lname", "father_fname", "father_mname")) {
    data_final[[col]] <- stringr::str_trim(data_final[[col]])
  }

  # Save dataset; only a literal trailing ".csv" is rewritten
  if (verbose) print("writing file")
  dir.create(out_path, showWarnings = FALSE)
  path_to_processed_data <- paste(out_path, sub("\\.csv$", "_f.csv", file_name), sep = "/")
  readr::write_csv(data_final, path_to_processed_data)

  invisible(NULL)
}

apply function to clean father names

# Arguments for cleaning the father name fields of the file produced above
in_path_f   <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
lname_col_f <- "father_lname"
fname_col_f <- "father_fname"
out_path_f  <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
# Full set of pass-through columns (currently disabled):
# other_cols_f <- c("fname","mname","lname","sex","fname_raw","mname_raw","lname_raw", "ssn","zip_residence","byear","dyear","socstate","bmonth","dmonth","bday","dday",
#                 "number_apps","race_first","race_last","race_last_cyear","race_first_cyear",
#                 "race_last_cmonth","race_first_cmonth","race_change","number_claims","bpl",
#                 "mother_fname","mother_mname",
#                 "mother_lname","age_first_application","death_age","weight","ccweight",
#                 "bpl_string","socstate_string")
other_cols_f <- c(
  "fname", "mname", "lname", "sex",
  "fname_raw", "mname_raw", "lname_raw",
  "ssn",
  "mother_fname", "mother_mname", "mother_lname"
)
middle_f    <- "father_mname"
file_name_f <- "bunmd_cleaned_e.csv"

clean_names_father(
  in_path    = in_path_f,
  file_name  = file_name_f,
  lname_col  = lname_col_f,
  fname_col  = fname_col_f,
  other_cols = other_cols_f,
  out_path   = out_path_f,
  middle     = middle_f
)

Check cleaned dataset

# Peek at the first 1000 rows of the father-cleaned file, ordered by ssn.
# `nrows` is spelled out in full; `nrow` only worked through partial argument
# matching.
bunmd_cleaned_f <- fread("/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_cleaned_e_f.csv", nrows = 1000)
head(bunmd_cleaned_f %>% arrange(ssn))

Repeat with mother names: create the function clean_names_mother

#clean fname ABE Method https://github.com/uguryi/abeR/blob/master/R/clean-names.R

#' Clean mothers' first, middle, and last names (ABE method)
#'
#' Reads every column of `file_name` from `in_path`, standardizes the mother
#' name fields, and writes the result to `out_path` with the trailing ".csv" of
#' `file_name` replaced by "_m.csv". Output columns are `mother_fname`,
#' `mother_mname`, `mother_lname`, their `*_raw` counterparts, and `other_cols`.
#'
#' Relies on objects that are not arguments: `remove_titles()` and the
#' `nicknames` table from this file, plus `split_first_name()` and
#' `merge_names_no_middle()`, which are not defined in this file (presumably
#' provided by censocdev's clean-names.R -- confirm).
#'
#' @param in_path Directory containing the input CSV.
#' @param file_name Name of the input CSV file.
#' @param lname_col,fname_col Names of the mother's last-name and first-name
#'   columns in the input.
#' @param other_cols Character vector of extra columns to carry through.
#' @param out_path Output directory (created if it does not exist).
#' @param middle Name of the mother's middle-name column, or "" if none.
#' @param verbose If `TRUE`, print a progress message before each step.
#' @return `NULL`, invisibly. Called for the side effect of writing the file.
clean_names_mother <- function(in_path, file_name, lname_col, fname_col, other_cols, out_path, middle, verbose = TRUE) {
  has_middle <- middle != ""

  # ASCII-fold, lower-case, trim, turn "." into a space, and drop everything
  # that is not a lower-case letter or a space
  normalize_name <- function(x) {
    x <- iconv(x, to = "ASCII//TRANSLIT")
    x <- tolower(x)
    x <- stringr::str_trim(x)
    x <- gsub("\\.", " ", x)
    gsub("[^a-z ]", "", x)
  }

  # Glue common prefixes onto the following word (e.g., " mc donnell" -->
  # " mcdonnell"); expects a leading space on every element.
  # Many more prefixes could be added: la, van, du, de, del, o, pal, le, de la
  join_prefixes <- function(x) {
    for (prefix in c("st", "ste", "saint", "virg", "mac", "mc")) {
      x <- gsub(paste0(" ", prefix, " "), paste0(" ", prefix), x)
    }
    stringr::str_trim(x)
  }

  # Read data (all columns)
  if (verbose) print("reading data")
  path_to_data <- paste(in_path, file_name, sep = "/")
  data <- readr::read_csv(path_to_data)

  # Copy raw names before processing
  data$lname_clean <- data[[lname_col]]
  data$fname_clean <- data[[fname_col]]

  # Constant sex code used as the nickname merge key
  # (presumably 2 = female in nicknames$sex -- confirm)
  data$mother_sex <- 2
  sex_col <- "mother_sex"

  # Concatenate middle name to first name if a middle name exists.
  # This avoids inconsistencies around middle initials: ABE turns "A J" into
  # first name "A J" with initial "J", while BUNMD stores first name "A" and
  # initial "J".
  if (verbose) print("concatenating middle names")
  if (has_middle) {
    first_part <- data$fname_clean
    middle_part <- data[[middle]]
    # NA first name stays NA; NA middle leaves the first name unchanged
    both_present <- !is.na(first_part) & !is.na(middle_part)
    first_part[both_present] <- paste(first_part[both_present], middle_part[both_present])
    data$fname_clean <- first_part
    data$mother_mname_raw <- middle_part
    data[[middle]] <- NULL
  }

  if (verbose) print("initial name cleaning")
  data$lname_clean <- normalize_name(data$lname_clean)
  data$fname_clean <- normalize_name(data$fname_clean)

  # Get rid of titles (remove_titles matches " title ", hence the padding)
  if (verbose) print("removing titles")
  data$fname_clean <- remove_titles(paste0(" ", data$fname_clean))

  # Fix two-word names (e.g., mc donnell --> mcdonnell)
  if (verbose) print("fixing two-word names")
  data$lname_clean <- join_prefixes(paste0(" ", data$lname_clean))
  data$fname_clean <- join_prefixes(data$fname_clean)

  # Split first name (i.e., deal with middle names and initials)
  if (verbose) print("splitting first name")
  parts <- unlist(lapply(data$fname_clean, split_first_name)) # SLOW (~60s with ~3M obs)
  data$first <- parts[names(parts) == "first"]
  data$mid <- parts[names(parts) == "middle"]
  rm(parts)

  # Merge nicknames
  if (verbose) print("merging nicknames")
  data_final <- merge(data, nicknames, by.x = c("first", sex_col),
                      by.y = c("nickname", "sex"), all.x = TRUE) # SLOW (~30s with ~3M obs)
  data_final$fname_final <- apply(data_final, 1, merge_names_no_middle) # SLOW (~45s with ~3M obs)

  # Keep only the "first" part that split_first_name returns for lname_clean
  if (verbose) print("splitting last names")
  parts <- unlist(lapply(data_final$lname_clean, split_first_name)) # SLOW (~60s with ~3M obs)
  data_final$lname_final <- parts[names(parts) == "first"]
  rm(parts)

  # Reorder columns, drop working columns. mother_mname_raw only exists when a
  # middle column was supplied, so it is selected and named only in that case.
  raw_middle <- if (has_middle) "mother_mname_raw"
  cols_to_keep <- c("fname_final", "mid", "lname_final",
                    fname_col, raw_middle, lname_col, other_cols)
  data_final <- data_final[, cols_to_keep]
  colnames(data_final) <- c("mother_fname", "mother_mname", "mother_lname",
                            "mother_fname_raw", raw_middle, "mother_lname_raw",
                            other_cols)

  # Do a final trim of the mother name columns produced above
  for (col in c("mother_lname", "mother_fname", "mother_mname")) {
    data_final[[col]] <- stringr::str_trim(data_final[[col]])
  }

  # Save dataset; only a literal trailing ".csv" is rewritten
  if (verbose) print("saving dataset")
  dir.create(out_path, showWarnings = FALSE)
  path_to_processed_data <- paste(out_path, sub("\\.csv$", "_m.csv", file_name), sep = "/")
  readr::write_csv(data_final, path_to_processed_data)

  invisible(NULL)
}

apply function to mothers

# Arguments for cleaning the mother name fields of the father-cleaned file.
# The *_f variable names from the father step are reused here.
in_path_f   <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
lname_col_f <- "mother_lname"
fname_col_f <- "mother_fname"
sex_col_f   <- "mother_sex"
out_path_f  <- "/censoc/data/numident/4_berkeley_unified_mortality_database"
# Full set of pass-through columns (currently disabled):
# other_cols_f <- c("fname","mname","lname","sex","fname_raw","mname_raw","lname_raw","father_fname",
#                   "father_mname","father_lname", "father_fname_raw",
#                   "father_mname_raw","father_lname_raw",
#                   "ssn","zip_residence",
#                   "byear","dyear","socstate","bmonth","dmonth","bday","dday",
#                 "number_apps","race_first","race_last","race_last_cyear","race_first_cyear",
#                 "race_last_cmonth","race_first_cmonth","race_change","number_claims","bpl",
#                 "age_first_application","death_age","weight","ccweight",
#                 "bpl_string","socstate_string")
other_cols_f <- c(
  "fname", "mname", "lname", "sex",
  "fname_raw", "mname_raw", "lname_raw",
  "father_fname", "father_mname", "father_lname",
  "father_fname_raw", "father_mname_raw", "father_lname_raw",
  "ssn"
)
middle_f    <- "mother_mname"
file_name_f <- "bunmd_cleaned_e_f.csv"

clean_names_mother(
  in_path    = in_path_f,
  file_name  = file_name_f,
  lname_col  = lname_col_f,
  fname_col  = fname_col_f,
  other_cols = other_cols_f,
  out_path   = out_path_f,
  middle     = middle_f
)

Check cleaned dataset

# Load the full mother-cleaned file and show its first rows ordered by ssn
bunmd_cleaned_m <- fread(
  "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_cleaned_e_f_m.csv"
)
head(arrange(bunmd_cleaned_m, ssn))

reorder columns in clean dataset and export final file

#bunmd_names_cleaned <- bunmd_cleaned_m[,c("fname","mname","lname","fname_raw","mname_raw","lname_raw","sex","ssn","zip_residence","byear","dyear","socstate","bmonth","dmonth","bday","dday","number_apps","race_first","race_last","race_last_cyear","race_first_cyear","race_last_cmonth","race_first_cmonth","race_change" , "number_claims","bpl", "age_first_application","death_age","weight","ccweight","bpl_string","socstate_string", "father_fname","father_mname","father_lname","father_fname_raw","father_mname_raw","father_lname_raw","father_sex","mother_fname","mother_mname","mother_lname","mother_fname_raw","mother_mname_raw","mother_lname_raw","mother_sex")]
#fwrite(bunmd_names_cleaned, "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_names_cleaned.csv")

Remove bad strings

# Placeholder strings that mark a raw name field as missing or unknown
invalid_name_strings <- c("UNK", "UNKNOWN", "UNKNWON","UNKOWN", "WITHHELD", "MISSING", "NS",
                          "NOTSTATED", "NOT STATED", "NOT KNOWN", "NOT LISTED", "NOT RECORD", "NOT RECORDED", 
                      "NOT SHOWN", "NOT GIVEN", "NOT PROVIDED", "NOT NAMED", "NOT AVAILABLE",
                      "NOT VERIFIED", "NOT AVAIL", "NOT ESTABLISHED", "NOT SHOWN", 
                      "NOT", "STATED", "KNOWN", "LISTED", "RECORD", "RECORDED", "SHOWN",
                      "GIVEN", "PROVIDED", "NAMED", "AVAILABLE", "VERIFIED", "AVAIL", "ESTABLISHED",
                      "SHOWN", "AVAIL", "APPL", "FURNISHED", "SHOWN ON")

# For every cleaned name column, blank the rows whose raw counterpart exactly
# equals one of the placeholder strings. The table is updated by reference;
# local() keeps the loop variables out of the global environment.
invisible(local({
  for (who in c("father_", "mother_", "")) {
    for (part in c("fname", "mname", "lname")) {
      clean_col <- paste0(who, part)
      flagged <- which(
        bunmd_cleaned_m[[paste0(clean_col, "_raw")]] %in% invalid_name_strings
      )
      set(bunmd_cleaned_m, i = flagged, j = clean_col, value = "")
    }
  }
}))

Extract cleaned name fields only and order columns

# Keep ssn plus the cleaned (non-raw) name columns, add a "_clean" suffix to
# every column except ssn, and sort by ssn
bunmd_clean_name_fields <- bunmd_cleaned_m %>%
  select(ssn, contains("name")) %>%
  select(!contains("raw")) %>%
  rename_at(vars(-ssn), ~ paste0(., "_clean")) %>%
  arrange(ssn)

# replace <NA> with blank strings, column by column
for (j in seq_along(bunmd_clean_name_fields)) {
  set(
    bunmd_clean_name_fields,
    i = which(is.na(bunmd_clean_name_fields[[j]])),
    j = j,
    value = ""
  )
}

# rearrange columns: individual, then father, then mother
bunmd_clean_name_fields <- bunmd_clean_name_fields[, c(
  "ssn",
  "fname_clean", "mname_clean", "lname_clean",
  "father_fname_clean", "father_mname_clean", "father_lname_clean",
  "mother_fname_clean", "mother_mname_clean", "mother_lname_clean"
)]

write file

# Write the ssn-keyed table of cleaned name fields to the final CSV
readr::write_csv(bunmd_clean_name_fields,
          "/censoc/data/numident/4_berkeley_unified_mortality_database/bunmd_clean_name_fields_only_updated.csv")


caseybreen/wcensoc documentation built on Nov. 21, 2024, 5:15 a.m.