Summary: Assessment of CenSoc-DMF using the DMF as a population denominator. Because the DMF contains no covariates, the only assessment possible here is the match rate (number of matched records / number of people in the DMF). The DMF also contains no sex information, but only men are matched, so I use data from the HMD to approximate the male-specific match rate.

read in data

library(cowplot)
library(ggsci)
library(data.table)
library(tidyverse)
library(here)

# Load the complete, unlinked DMF (N = 61,111,735)
dmf_full <- fread('/censoc/data/dmf/ssdm.csv')

# Load the linked CenSoc-DMF file
dmf_linked <- fread('/censoc/data/censoc_v2.1/censoc_dmf_v2.1.csv')

# SSNs of everyone in the linked set, and of the conservative-link subset
dmf_ssns <- dmf_linked[["ssn"]]
dmf_cons_ssns <- dmf_linked[link_abe_exact_conservative == 1][["ssn"]]

# Flag standard and conservative links on the full (unmatched) file by ssn.
# `:=` adds the columns by reference, so the full table is not copied.
dmf_full[, link_standard := ssn %in% dmf_ssns]
dmf_full[, link_conservative := ssn %in% dmf_cons_ssns]

# Keep only the death years that are used (N = 54,625,460)
dmf_full_date_restricted <- dmf_full[dyear %in% 1975:2005]

Get sex ratios from HMD

# Read the HMD death counts and make Age numeric: the "110+" label is recoded
# to 110, every other value is converted with as.numeric()
hmd <- fread('/home/ipums/mariaosborne-ipums/censoc/hmd_death_counts.txt') %>%
  mutate(Age = if_else(Age == "110+", 110, as.numeric(Age)))


# Compute the sex ratio of deaths (male deaths / total deaths) in 1975-2005
# for each birth cohort.
# Cohort is approximated as Year - Age, so it will be a little off.
hmd <- hmd %>% mutate(cohort = Year - Age)

# Restrict HMD deaths to the same death-year window that is applied to the DMF
# (1975-2005); without this the ratio would be taken over every death year
# present in the HMD file rather than over the window used as the denominator.
hmd_window <- hmd %>% filter(Year %in% 1975:2005)

# Print the male share of deaths for each birth cohort 1910-1930.
# The loop variable is deliberately not named `c`, to avoid masking base::c().
for (birth_cohort in 1910:1930) {
  cohort_deaths <- hmd_window %>% filter(cohort == birth_cohort)
  male_deaths <- sum(cohort_deaths$Male)
  total_deaths <- sum(cohort_deaths$Total)
  sex_ratio <- male_deaths / total_deaths
  print(sex_ratio)
}

Total match rate

(date of death considered, gender NOT considered)

# Share of DMF records linked under each rule, by birth year (1900-1935).
# The mean of a logical link flag is the proportion of records that matched.
dmf_full_date_restricted %>%
  filter(byear %in% 1900:1935) %>%
  group_by(byear) %>%
  summarize(
    match_rate = mean(link_standard),
    cons_match_rate = mean(link_conservative)
  ) %>%
  ggplot(mapping = aes(x = byear)) +
  # standard links
  geom_point(mapping = aes(y = match_rate), color = 'red3') +
  geom_line(mapping = aes(y = match_rate), color = 'red3') +
  # conservative links
  geom_point(mapping = aes(y = cons_match_rate), color = 'midnightblue') +
  geom_line(mapping = aes(y = cons_match_rate), color = 'midnightblue') +
  scale_y_continuous(limits = c(0, 0.5)) +
  theme_minimal()

Approximated male-only match rate

# Build one row per birth cohort (1880-1940) holding:
#   cohort     - birth year
#   num_std    - number of standard links in the date-restricted DMF
#   num_cons   - number of conservative links in the date-restricted DMF
#   num_deaths - number of DMF records for the cohort
#   sex_ratio  - male share of deaths (Male / Total) from HMD
# Grouped summaries replace the per-cohort loop, which rescanned the full DMF
# once per cohort and grew the result with rbind() on every iteration.
cohorts <- 1880:1940

# DMF: link counts and record counts per birth cohort, in a single pass
dmf_cohort_counts <- dmf_full_date_restricted %>%
  filter(byear %in% cohorts) %>%
  group_by(byear) %>%
  summarize(num_std = sum(link_standard),
            num_cons = sum(link_conservative),
            num_deaths = n()) %>%
  rename(cohort = byear)

# HMD: male share of deaths per cohort, restricted to the same death-year
# window that is applied to the DMF (1975-2005) so that numerator and
# denominator refer to the same deaths
hmd_cohort_ratio <- hmd %>%
  filter(cohort %in% cohorts, Year %in% 1975:2005) %>%
  group_by(cohort) %>%
  summarize(sex_ratio = sum(Male) / sum(Total))

# Start from the full cohort list so cohorts with no DMF records still get a
# row (with zero counts). Cohorts with no HMD rows get a missing sex_ratio.
dmf_matches_sex <- data.frame(cohort = cohorts) %>%
  left_join(dmf_cohort_counts, by = "cohort") %>%
  left_join(hmd_cohort_ratio, by = "cohort") %>%
  mutate(num_std = coalesce(num_std, 0),
         num_cons = coalesce(num_cons, 0),
         num_deaths = coalesce(num_deaths, 0))

# look at number of deaths, number of matches, and sex ratio per cohort
dmf_matches_sex

plot male-specific match rate

# Multiply total deaths by the presumed sex ratio (share of deaths that are
# male according to HMD) to estimate the number of male deaths per cohort
dmf_matches_sex <- dmf_matches_sex %>%
  mutate(num_male = num_deaths * sex_ratio)

# Male-specific match rate under each linking rule, tagged by rule
dmf_matches_sex_standard <- dmf_matches_sex %>%
  mutate(match_rate = num_std / num_male,
         match_type = "standard")
dmf_matches_sex_cons <- dmf_matches_sex %>%
  mutate(match_rate = num_cons / num_male,
         match_type = "conservative")

# Stack both rules and plot match rate by birth cohort (1900-1935)
dmf_match_rate_plot <- bind_rows(dmf_matches_sex_standard,
                                 dmf_matches_sex_cons) %>%
  filter(cohort %in% 1900:1935) %>%
  ggplot(mapping = aes(x = cohort,
                       y = match_rate,
                       color = match_type,
                       shape = match_type)) +
  geom_line(size = 1) +
  geom_point(size = 2.5) +
  theme_cowplot() +
  ggsci::scale_color_lancet() +
  labs(x = "Birth Cohort", y = "Match Rate") +
  scale_y_continuous(limits = c(0, 0.5)) +
  theme(legend.position = "bottom",
        legend.title = element_blank())

# display the figure
dmf_match_rate_plot

# write the figure to the vignette's figs directory
ggsave(filename = here("vignettes/assess_match_quality/figs/dmf_denominator_match_rate_plot.pdf"),
       plot = dmf_match_rate_plot,
       height = 5,
       width = 7)

Plot sex ratio by cohort

# Male share of deaths by birth cohort (1900-1935), as stored in sex_ratio
sex_ratio_plot <- dmf_matches_sex %>%
  filter(cohort %in% 1900:1935) %>%
  ggplot(mapping = aes(x = cohort, y = sex_ratio)) +
  geom_point(size = 2.5) +
  geom_line(size = .75) +
  theme_cowplot() +
  scale_y_continuous(limits = c(0.3, 0.7)) +
  labs(x = "Birth Cohort", y = "Sex Ratio")

# display the figure
sex_ratio_plot

# write the figure to the vignette's figs directory
ggsave(filename = here("vignettes/assess_match_quality/figs/dmf_sex_ratio_plot.pdf"),
       plot = sex_ratio_plot,
       height = 5,
       width = 7)


caseybreen/wcensoc documentation built on Nov. 21, 2024, 5:15 a.m.