Summary: Assessing match quality middle initials in the CenSoc-Numident (V2) file.

## Library packages 
library(tidyverse)
library(data.table)
library(censocdev)
library(cowplot)
library(gt)
library(ipumsr)
library(here)
library(janitor)
library(broom)
## read in BUNMD 
bunmd <- fread("/censoc/data/censoc_v2/bunmd_v2.csv")

bunmd %>% 
  summarize(mean(mname == ""))
## read in numident 
## special file with ssn 
numident <- fread("/censoc/data/censoc_v2.1/censoc_numident_v2.1.csv") %>% 
  janitor::clean_names()

## numident 
numident <- numident %>% 
  left_join(bunmd %>% select(ssn, mname), by = "ssn")

## read in 1940 census 
census_1940 <- fread("/ipums-repo2019/1940/TSV/P.tsv", select = c("NAMEFRST", "HISTID", "SERIALP", "AGE", "INCWAGE", "SEX", "EDUC", "RACE", "RELATE", "MARST", "SEI")) %>% 
  janitor::clean_names()

censoc_numident <- numident %>% 
  left_join(census_1940, by = "histid")
get_second_word <- function(x)
{
  x.split <- strsplit(x, split = " ")
  x1.list <- lapply(x.split,`[`, 2) # returns NA if no name
  x1 <- unlist(x1.list)
  return(x1)
}

clean_key <- function(key){
  return(gsub(" +|(?!_)[[:punct:]]","", key, perl = T))
}
## Select First letter of middle name (census)
censoc_numident$middle_name_census <- get_second_word(censoc_numident$namefrst)
censoc_numident$middle_name_census <- clean_key(censoc_numident$middle_name_census)
censoc_numident$middle_name_census <- substring(censoc_numident$middle_name_census, 1, 1)

## Select First letter of middle name (ss5)
censoc_numident$middle_name_numident <- substring(censoc_numident$mname, first = 1, 1)
censoc_numident %>% 
  count(middle_name_numident) %>% 
  arrange(desc(n))

censoc_numident %>% 
  count(middle_name_census) %>% 
  arrange(desc(n))

censoc_numident <- censoc_numident %>% 
  mutate(middle_name_numident_recode = case_when(
    str_detect(middle_name_numident, "[:alpha:]") ~ middle_name_numident,
    TRUE ~ NA_character_
  ),
  middle_name_census_recode = case_when(
    str_detect(middle_name_census, "[:alpha:]") ~ middle_name_census,
    TRUE ~ NA_character_
  ))

censoc_numident %>% 
  summarize(mean(!is.na(middle_name_numident_recode)),
            mean(!is.na(middle_name_census_recode)),
            mean(!is.na(middle_name_census_recode) & !is.na(middle_name_numident_recode)))
censoc_numident_middle <- censoc_numident %>% 
  filter(sex.x == 1) %>% 
  filter(!is.na(middle_name_census_recode) & !is.na(middle_name_numident_recode))

censoc_numident_middle <- censoc_numident_middle %>% 
  mutate(agreement = case_when(
    middle_name_census_recode == middle_name_numident_recode ~ 1,
    TRUE ~ 0))
censoc_numident_middle %>% 
  summarize(mean(agreement))

censoc_numident_middle %>% 
  group_by(link_abe_exact_conservative) %>% 
  summarize(mean(agreement))
match_rate <- censoc_numident_middle %>% 
  filter(link_abe_exact_conservative == 1) %>% 
  group_by(byear) %>% 
  summarize(match_rate = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(match = "conservative")

match_rate_standard <- censoc_numident_middle %>% 
  group_by(byear) %>% 
  summarize(match_rate = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(match = "standard")

match_rate_standard_not_conservative <- censoc_numident_middle %>% 
  filter(link_abe_exact_conservative == 0) %>% 
  group_by(byear) %>% 
  summarize(match_rate = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(match = "standard, not conservative")

middle_initial_agreement_plot <- match_rate %>% 
  bind_rows(match_rate_standard) %>% 
  bind_rows(match_rate_standard_not_conservative) %>% 
  filter(byear %in% c(1900:1939)) %>% 
  ggplot(aes(x = byear, y = match_rate, ymin = match_rate - 1.96*se, ymax = match_rate + 1.96*se,  color = match, shape = match)) + 
  geom_pointrange() + 
  geom_line() + 
  theme_cowplot() + 
  ggsci::scale_color_lancet() + 
  labs(x = "Birth Cohort",
       y = "Middle Initial Agreement Rate",
       title = "Middle Initial Agreement, CenSoc-Numident (Men)") + 
  theme(legend.position = "bottom", legend.title = element_blank()) + 
  ylim(0.3, 1)


## save plot 
ggsave(middle_initial_agreement_plot, filename = here("vignettes/assess_match_quality/figs/numident_middle_initial_plot.pdf"), height = 5, width = 7)
censoc_numident_middle <- censoc_numident_middle %>% 
  recode_education(educ_var = educ)

educ_middle_conservative <- censoc_numident_middle %>% 
  filter(link_abe_exact_conservative == 1) %>% 
  group_by(educ_yrs) %>% 
  summarize(agreement_mean = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(method = "conservative")

educ_middle_standard_noconserv <- censoc_numident_middle %>% 
  filter(link_abe_exact_conservative == 0) %>% 
  group_by(educ_yrs) %>% 
  summarize(agreement_mean = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
  mutate(method = "standard, not conservative")

educ_middle <- censoc_numident_middle %>% 
  group_by(educ_yrs) %>% 
  summarize(agreement_mean = mean(agreement),
            se = sd(agreement)/sqrt(n())) %>% 
    mutate(method = "standard")

middle_initial_plot <- educ_middle %>% 
  bind_rows(educ_middle_conservative) %>% 
 #  bind_rows(educ_middle_standard_noconserv) %>% 
  filter(!is.na(educ_yrs)) %>% 
ggplot(aes(x = educ_yrs,
           y = agreement_mean,
           ymin = agreement_mean - 1.96*se,
           ymax = agreement_mean + 1.96*se,
           color = method,
           shape = method)) + 
  geom_pointrange() + 
  geom_line() + 
  theme_cowplot() + 
  ggsci::scale_color_lancet() + 
  labs(x = "Years of education",
       y = "Middle Initial Agreement Rate",
       title = "Middle Initial Agreement, CenSoc-Numident (Men)") + 
  theme(legend.position = "bottom") + 
  theme(legend.position = "bottom", legend.title = element_blank())

middle_initial_plot <- educ_middle %>% 
  bind_rows(educ_middle_conservative) %>% 
 #  bind_rows(educ_middle_standard_noconserv) %>% 
  filter(!is.na(educ_yrs)) %>% 
ggplot(aes(x = educ_yrs,
           y = agreement_mean,
           ymin = agreement_mean - 1.96*se,
           ymax = agreement_mean + 1.96*se,
           color = method)) + 
  geom_pointrange() + 
  geom_line() + 
  theme_cowplot() + 
  ggsci::scale_color_lancet() + 
  labs(x = "Years of education",
       y = "Middle Initial Agreement Rate",
       title = "Middle Initial Agreement, CenSoc-Numident (Men)") + 
  theme(legend.position = "bottom") + 
  theme(legend.position = "bottom", legend.title = element_blank())


## save plot 
ggsave(middle_initial_plot, filename = here("vignettes/assess_match_quality/figs/numident_education_middle_initial_plot.pdf"), height = 5, width = 7)
 middle_initial_agreement_byear_flex <- censoc_numident_middle %>% 
    filter(link_abe_exact_conservative == 1) %>% 
  group_by(byeardiff_census_minus_numident) %>% 
  summarize(middle = mean(agreement),
            se = sd(agreement)/n()) %>% 
  ggplot(aes(x = byeardiff_census_minus_numident, 
             y = middle,
             ymin = middle - 1.96*se,
             ymax = middle + 1.96*se,
             )) + 
  geom_pointrange(size = 1) + 
  theme_cowplot() + 
  ylim(0, 1) + 
  labs(x = "Birth Year Difference (Census - Numident)",
       y = "Middle Initial Agreemen")

## save plot 
ggsave(middle_initial_agreement_byear_flex, filename = here("vignettes/assess_match_quality/figs/numident_byearflex_middle_initial_plot.pdf"), height = 6, width = 8)
standard <- censoc_numident_middle %>% 
  filter(byear %in% 1900:1920)

conservative <- censoc_numident_middle %>% 
  filter(link_abe_exact_conservative == 1) %>% 
  filter(byear %in% 1900:1920)

standard_not_conservative <- censoc_numident_middle %>% 
  filter(link_abe_exact_conservative == 0) %>% 
  filter(byear %in% 1900:1920)

education_standard <- standard %>% 
  group_by(agreement) %>%
  do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% 
  unnest(fitHour) %>% 
  mutate(method = "standard")

education_conservative <- conservative %>% 
  group_by(agreement) %>%
  do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% 
  unnest(fitHour) %>% 
  mutate(method = "conservative")

education_standard_conservative <- standard_not_conservative %>% 
  group_by(agreement) %>%
  do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% 
  unnest(fitHour) %>% 
  mutate(method = "standard, not conservative")

education_pooled_standard <- tidy(lm(death_age ~ educ_yrs + as.factor(byear), standard)) %>% 
  mutate(method = "standard") 

education_pooled_conservative <- tidy(lm(death_age ~ educ_yrs + as.factor(byear), conservative)) %>% 
  mutate(method = "conservative")

education_pooled_standard_noconservative <- tidy(lm(death_age ~ educ_yrs + as.factor(byear), standard_not_conservative)) %>% 
  mutate(method = "standard, not conservative")

association_between_educ_and_longevity <- education_standard %>%  
  bind_rows(education_conservative) %>% 
  bind_rows(education_standard_conservative) %>% 
  bind_rows(education_pooled_standard) %>% 
  bind_rows(education_pooled_conservative) %>% 
  bind_rows(education_pooled_standard_noconservative) %>% 
  mutate(agreement = as.factor(agreement)) %>% 
  filter(term == "educ_yrs") %>% 
  mutate(agreement = case_when(
    agreement == 1 ~ "Agree",
    agreement == 0 ~"Disagree",
    TRUE ~ "Pooled")) %>% 
  ggplot(aes(x = method,
             y = estimate, 
             ymin = estimate - 1.96 * std.error,
             ymax = estimate + 1.96*std.error,
             color = agreement,
             shape = agreement)) + 
  geom_pointrange(position = position_dodge(0.1)) + 
  ggsci::scale_color_lancet() + 
  cowplot::theme_cowplot() + 
  theme(legend.position = "bottom") + 
  ylim(0, 0.15) + 
  labs(x = "Match Method",
       y = "Coefficient",
       title = "Association between years of education and longevity (OLS)",
       subtitle = "CenSoc-Numident, Birth cohorts of 1900-1920 (Men Only)",
       col = "Middle Initial Agreement",
       shape = "Middle Initial Agreement")

## save plot 
ggsave(association_between_educ_and_longevity, filename = here("vignettes/assess_match_quality/figs/numident_association_between_educ_and_longevity.pdf"), height = 6, width = 8.5)
## different in birth year 
education_standard <- censoc_numident_middle %>% 
  group_by(byeardiff_census_minus_numident) %>%
  do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% 
  unnest(fitHour) %>% 
  filter(term == "educ_yrs") %>% 
  mutate(method = "standard")

education_conservative <- censoc_numident_middle %>% 
    filter(byear %in% 1900:1920) %>% 
    filter(link_abe_exact_conservative == 1) %>% 
  group_by(byeardiff_census_minus_numident) %>%
  do(fitHour = tidy(lm(death_age ~ educ_yrs + as.factor(byear), data = .))) %>% 
  unnest(fitHour) %>% 
  filter(term == "educ_yrs") %>% 
  mutate(method = "conservative")

education_gradient_birth_difference <- education_standard %>%  
  bind_rows(education_conservative) %>% 
  ggplot(aes(x = byeardiff_census_minus_numident, 
             y = estimate,
             ymin = estimate - 1.96*std.error,
             ymax = estimate + 1.96*std.error,
         color = method)) + 
  theme_cowplot() + 
  geom_pointrange(position = position_dodge(0.15)) + 
  ggsci::scale_color_lancet() + 
  labs(x = "Census Birth Year - Numident Birth Year",
       y = "Regression Coefficient",
       color = "Match Method"
  ) + 
  theme(legend.position = "bottom")

## save plot 
ggsave(education_gradient_birth_difference, filename = here("vignettes/assess_match_quality/figs/numident_education_gradient_birth_difference.pdf"), height = 5, width = 7)


caseybreen/wcensoc documentation built on Nov. 21, 2024, 5:15 a.m.