[^updated]: Last updated: `r format(Sys.time(), '%d %B, %Y')`

| Page | Variable | Label |
|----:|:-------------|:---------------------------------------------|
| \hyperlink{page.2}{2} | \hyperlink{page.2}{HISTID} | Historical unique identifier |
| \hyperlink{page.3}{3} | \hyperlink{page.3}{byear} | Year of birth |
| \hyperlink{page.4}{4} | \hyperlink{page.4}{bmonth} | Month of birth |
| \hyperlink{page.5}{5} | \hyperlink{page.5}{dyear} | Year of death |
| \hyperlink{page.6}{6} | \hyperlink{page.6}{dmonth} | Month of death |
| \hyperlink{page.7}{7} | \hyperlink{page.7}{death_age} | Age at death (years) |
| \hyperlink{page.8}{8} | \hyperlink{page.8}{weight} | CenSoc weight |

\vspace{300pt}

Summary: The CenSoc-DMF Version 3.0 dataset (N = 4,682,737) links the full-count 1940 Census to the Death Master File (DMF), a collection of death records reported to the Social Security Administration. Records were linked using the conservative variant of the ABE method developed by Abramitzky, Boustan, and Eriksson (\textcolor{blue}{2012}, \textcolor{blue}{2014}, \textcolor{blue}{2017}). To merge 1940 Census variables with this dataset, researchers can obtain a copy of the 1940 Census from IPUMS-USA and link the two files on HISTID, the unique individual-level identifier.

\newpage

\huge HISTID \normalsize \vspace{12pt}

Label: Historical Unique Identifier

Description: HISTID is a unique individual-level identifier. It can be used to merge the CenSoc-DMF file with the 1940 Full-Count Census from IPUMS.

\newpage

\huge byear \normalsize \vspace{12pt}

Label: Birth Year

Description: byear reports a person's year of birth, as recorded in the Social Security Death Master File.

## Library Packages
library(tidyverse)
library(data.table)

## Location of the CenSoc-DMF v3 CSV release (edit to match the local copy)
input_path <- "/data/censoc/censoc_data_releases/censoc_dmf/censoc_dmf_v3/censoc_dmf_v3.csv"

## Load the censoc_dmf_v3 file; the plots and tables below are built from it
censoc_dmf_v3 <- readr::read_csv(file = input_path)
# Line-and-point plot of the number of records per birth year.
byear_plot <- censoc_dmf_v3 %>%
  count(byear) %>% # one row per birth year; n = number of records
  ggplot(aes(x = byear, y = n)) +
  geom_line() +
  geom_point() +
  theme_minimal(base_size = 15) +
  theme(legend.position = "bottom") +
  # title and axis labels set once; the old placeholder xlab("title") was
  # immediately overridden by labs(x = ...) and has been removed
  labs(title = "Year of Birth", x = "Year", y = "Count") +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

\vspace{75pt}

# Display the birth-year plot (a bare object name at top level auto-prints).
byear_plot

\newpage \huge bmonth \normalsize \vspace{12pt}

Label: Birth Month

Description: bmonth reports a person's month of birth, as recorded in the Social Security Death Master File.

## run in the console and copy and paste into documentation
# Frequency table of birth month: count and share (in percent, 2 significant
# digits) per month. Rows with bmonth == 0 are excluded before tabulating.
bmonth_tabulated <- censoc_dmf_v3 %>%
  filter(bmonth != 0) %>%
  count(bmonth) %>%
  mutate(
    freq = signif(n * 100 / sum(n), 2),
    # look the name up by month number rather than by row position, so a
    # month that is absent from the data cannot shift or break the labels;
    # any value outside 1-12 gets an NA label
    label = month.name[match(as.integer(bmonth), 1:12)]
  ) %>%
  select(bmonth, label, n, `freq %` = freq) %>%
  knitr::kable()

\vspace{30pt}

# Display the birth-month table (a bare object name at top level auto-prints).
bmonth_tabulated

\newpage

\huge dyear \normalsize \vspace{12pt}

Label: Death Year

Description: dyear reports a person's year of death, as recorded in the Social Security Death Master File.

# Line-and-point plot of the number of records per death year.
dyear_plot <- censoc_dmf_v3 %>%
  count(dyear) %>% # one row per death year; n = number of records
  ggplot(aes(x = dyear, y = n)) +
  geom_line() +
  geom_point() +
  theme_minimal(base_size = 15) +
  theme(legend.position = "bottom") +
  # title and axis labels set once; the old placeholder xlab("title") was
  # immediately overridden by labs(x = ...) and has been removed
  labs(title = "Year of Death", x = "Year", y = "Count") +
  # y-axis is pinned to 0-250,000.
  # NOTE(review): with ggplot2's default out-of-bounds handling, a yearly
  # count above this cap would be dropped from the plot -- confirm the cap
  # still fits the data when the input file changes.
  scale_y_continuous(labels = scales::comma, limits = c(0, 250000)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

\vspace{75pt}

# Display the death-year plot (a bare object name at top level auto-prints).
dyear_plot

\newpage

\huge dmonth \normalsize \vspace{12pt}

Label: Death Month

Description: dmonth reports a person's month of death, as recorded in the Social Security Death Master File.

# Frequency table of death month: count and share (in percent, 2 significant
# digits) per month.
dmonth_tabulated <- censoc_dmf_v3 %>%
  count(dmonth) %>%
  mutate(
    freq = signif(n * 100 / sum(n), 2),
    # look the name up by month number rather than by row position; the
    # positional 12-element vector failed whenever the data did not contain
    # exactly twelve distinct dmonth values. Values outside 1-12 (e.g. 0 or
    # NA) stay in the table with an NA label instead of raising an error.
    label = month.name[match(as.integer(dmonth), 1:12)]
  ) %>%
  select(dmonth, label, n, `freq %` = freq) %>%
  knitr::kable()

\vspace{30pt}

# Display the death-month table (a bare object name at top level auto-prints).
dmonth_tabulated

\newpage

\huge death_age \normalsize \vspace{12pt}

Label: Age at Death (Years)

Description: death_age reports a person's age at death in years, calculated using the birth and death information recorded in the Social Security Death Master File.

# Line-and-point plot of the number of records per age at death (years).
death_age_plot <- censoc_dmf_v3 %>%
  count(death_age) %>% # one row per age at death; n = number of records
  ggplot(aes(x = death_age, y = n)) +
  geom_line() +
  geom_point() +
  theme_minimal(base_size = 15) +
  theme(legend.position = "bottom") +
  # title and axis labels set once; the old placeholder xlab("title") was
  # immediately overridden by labs(x = ...) and has been removed
  labs(title = "Age at Death", x = "Age at Death", y = "Count") +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

\vspace{75pt}

# Display the age-at-death plot (a bare object name at top level auto-prints).
death_age_plot

\newpage

\huge weight \normalsize

\vspace{12pt}

Label: Sample Weights

Description: A post-stratification person-weight that weights the sample up to National Center for Health Statistics (NCHS) totals for persons (1) dying between 1988 and 2005 and (2) dying between ages 65 and 100. Weights are based on age at death, year of death, sex, race, and place of birth. Please see the \textcolor{blue}{technical documentation on weights} for more information.

# Codebook table for `weight`: the smallest and largest non-missing weight
# (rounded to 2 decimals), followed by a row documenting the NA code.
weights_tabulated <- censoc_dmf_v3 %>%
  filter(!is.na(weight)) %>%
  summarize(
    `Min Weight` = round(min(weight), 2),
    `Max Weight` = round(max(weight), 2)
  ) %>%
  # reshape the single summary row into one (Value, Label) row per statistic
  pivot_longer(everything(), names_to = "Label", values_to = "Value") %>%
  select(Value, Label) %>%
  add_row(Label = "No Weight Assigned", Value = NA) %>%
  knitr::kable()

# Horizontal box-and-whisker plot of the non-missing weights. The y axis
# carries no information, so its title, labels, and ticks are hidden.
weight.plot <- censoc_dmf_v3 %>%
  filter(!is.na(weight)) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = weight), fill = "grey92") +
  scale_y_continuous(limits = c(-1, 1)) +
  theme_minimal(base_size = 15) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(title = "Distribution of Weights", x = "Weight")

## Lexis surface of the mean weight in each (year of death, age at death) cell
weights_lexis <- censoc_dmf_v3 %>%
  filter(!is.na(weight)) %>%
  group_by(death_age, dyear) %>%
  # .groups = "drop" returns an ungrouped table and silences the
  # "summarise() has grouped output" message
  summarize(weight = mean(weight), .groups = "drop") %>%
  ggplot() +
  geom_raster(aes(x = dyear, y = death_age, fill = weight)) +
  ## Lexis grid: horizontal age lines, vertical period lines, and diagonals
  ## (geom_abline's default slope of 1, i.e. age = year + intercept)
  geom_hline(
    yintercept = seq(65, 100, 10),
    alpha = 0.4, lty = "dotted"
  ) +
  geom_vline(
    xintercept = seq(1975, 2005, 10),
    alpha = 0.4, lty = "dotted"
  ) +
  geom_abline(
    intercept = seq(-100, 100, 10) - 1910,
    alpha = 0.4, lty = "dotted"
  ) +
  scale_fill_viridis_c(option = "magma") +
  # the axis titles ("Year", "Age") are supplied here as the scale names
  scale_x_continuous("Year",
    expand = c(0.02, 0),
    breaks = seq(1975, 2005, 5)
  ) +
  scale_y_continuous("Age",
    expand = c(0, 0),
    breaks = seq(65, 100, 10)
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  # coord: one year on x spans the same length as one year of age on y
  coord_equal() +
  # theme: start from an empty theme and add back only the text elements
  theme_void() +
  theme(
    axis.text = element_text(colour = "black"),
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 10, angle = 45, hjust = .5),
    plot.title = element_text(size = 10, vjust = 2),
    legend.text = element_text(size = 10),
    axis.title = element_text(size = 10, face = "bold")
  ) +
  # the former labs(X = "Year", Y = "Age") used upper-case names, which are
  # not aesthetics and were ignored; only the title is needed here
  labs(title = "Average weight by age at death and year of death")

\vspace{50pt}

# Display the weight codebook table (a bare object name at top level auto-prints).
weights_tabulated

\vspace{50pt}

# Display the Lexis-surface plot (a bare object name at top level auto-prints).
weights_lexis


caseybreen/wcensoc documentation built on Nov. 21, 2024, 5:15 a.m.