####################################################
# Last revised by Maria Osborne (August 14 2023) #
####################################################
#' Prepare Numident Data for weighting
#'
#' Harmonize and tabulate data for all years
#'
#' To weight:
#' 1) conservative links
#' 2) deaths age 65-100
#' 3) years 1988-2005
#'
#' Save individual records and tabulated records by strata
#'
# libraries
library(data.table)
library(tidyverse)
# ---- File locations ----
# Inputs: linked CenSoc-Numident file and the IPUMS detailed birthplace codes
path_numident <- "/data/censoc/censoc_data_releases/censoc_linked_to_census/v2.1/censoc_numident_v2.1_linked.csv"
path_us_bpl_codes <- "~mariaosborne-ipums/censoc_weights/data/IPUMS_detailed_BPL.txt"
# Output: directory that receives the prepared files
path_out <- "~mariaosborne-ipums/censoc_weights/data/numident/"

# ---- Load data ----
# Only the columns named here are read from disk
numident_vars <- c(
  "HISTID", "death_age", "byear", "dyear", "sex", "bpl", "bpl_string",
  "RACED", "link_abe_exact_conservative", "BPLD"
)
censoc_numident <- fread(path_numident, select = numident_vars)

# ---- Restrict to the records that get weighted ----
# Applied one condition at a time:
#   1) conservative links only
#   2) deaths at ages 65-100
#   3) deaths in 1988-2005
censoc_numident_toweight <- censoc_numident[link_abe_exact_conservative == 1]
censoc_numident_toweight <- censoc_numident_toweight[death_age %in% 65:100]
censoc_numident_toweight <- censoc_numident_toweight[dyear %in% 1988:2005]
# Recode variables to the labels used in the processed NCHS data

# Sex label: 1 -> "male", 2 -> "female"; any other value is left NA
censoc_numident_toweight[, sex_string := fcase(
  sex == 1, "male",
  sex == 2, "female"
)]

# Race label, taken from the census RACED variable (not the Numident):
# 100 -> "white", 200 -> "black", every other non-missing code -> "other".
# A missing RACED stays NA.
censoc_numident_toweight[, census_race := fcase(
  RACED == 100, "white",
  RACED == 200, "black",
  RACED != 100 & RACED != 200, "other"
)]
# Translate birthplaces into categories (stored in bpl_temp)
# Use Census BPL (BPLD), not numident bpl: numident bpl has a lot of "missing"
# bpls that are actually valid countries. For instance, over 39k people
# are born in English Canada, but have "missing" bpl in the Numident.
#
# The assignments below run in order, so a later rule overrides an earlier
# one for the same code (e.g. 4010 is first given its lookup name, then
# recoded to Oklahoma). Codes matched by no rule keep bpl_temp = NA; the
# table() printed after this section is the place to spot them.

# US birthplaces: lookup of detailed code -> place name, restricted to the
# codes at or below 5610 that this script treats as US states.
us_bpls <- fread(path_us_bpl_codes)
us_bpls <- us_bpls[code <= 5610]
# The lookup file is expected to have exactly two columns (code, name)
setnames(us_bpls, c("BPLD", "usa_bpl_string"))

# Attach the place name with a by-reference lookup rather than a dplyr join:
# the table stays a regular data.table (no copy, no class round-trip), and the
# row count cannot change even if a code were listed twice in the lookup
# (match() uses the first occurrence). Unmatched codes get NA.
censoc_numident_toweight[, usa_bpl_string :=
  us_bpls$usa_bpl_string[match(BPLD, us_bpls$BPLD)]]
censoc_numident_toweight[BPLD <= 5610, bpl_temp := usa_bpl_string] # us states

# unspecified US birthplaces
censoc_numident_toweight[BPLD %in% c(9000, 9900), bpl_temp := "Unspecified United States"]

# recode "Indian Territory" to Oklahoma
censoc_numident_toweight[BPLD == 4010, bpl_temp := "Oklahoma"]

# Current U.S. territories
censoc_numident_toweight[BPLD == 11000, bpl_temp := "Puerto Rico"]
censoc_numident_toweight[BPLD %in% c(10000:10500, 11500:12092), bpl_temp := "Other US Territory"]

# Foreign nations
# NCHS only categorizes Canada, Mexico, and Cuba. All others are aggregated.
censoc_numident_toweight[BPLD %in% 15000:15083, bpl_temp := "Canada"]
censoc_numident_toweight[BPLD == 20000, bpl_temp := "Mexico"]
censoc_numident_toweight[BPLD == 25000, bpl_temp := "Cuba"]
# NOTE(review): codes 12093-14999 are not covered by any rule here and would
# stay NA -- presumably no such codes exist in BPLD; confirm with the table().
censoc_numident_toweight[BPLD > 15083 & BPLD != 20000 & BPLD != 25000 &
                           BPLD < 90000, bpl_temp := "Other Foreign Born"]

# 90000 and above is unspecified abroad or at sea. Only about 100 people.
censoc_numident_toweight[BPLD >= 90000, bpl_temp := "Unknown Abroad"]
# Print the category counts for a visual check; the NA entry counts records
# whose birthplace code was not assigned a category
table(censoc_numident_toweight[["bpl_temp"]], useNA = "always")

# Matching key for birthplace: the category label in lower case with all
# spaces removed, so it can be matched across data sets
censoc_numident_toweight[
  , bpl_key := gsub(" ", "", tolower(bpl_temp), fixed = TRUE)
]
# 0/1 indicator columns for the type of birthplace, all added in one step
censoc_numident_toweight[, `:=`(
  # contiguous usa: state codes, leaving out 200 and 1500 (alaska & hawaii)
  bpl_flag_usa = as.integer(BPLD <= 5610 & !(BPLD %in% c(200, 1500))),
  # current territories, like P.R.
  bpl_flag_territory = as.integer(BPLD %in% 10000:12092),
  # alaska & hawaii
  bpl_flag_AKHI = as.integer(BPLD %in% c(1500, 200)),
  # foreign nations
  bpl_flag_foreign = as.integer(BPLD > 12092 & BPLD < 90000),
  # missing: unspecified US or unknown abroad
  bpl_flag_missing = as.integer(
    bpl_temp %in% c("Unknown Abroad", "Unspecified United States")
  )
)]

# Sanity check: the shares of the five flags should add up to one,
# i.e. every record carries exactly one flag
with(
  censoc_numident_toweight,
  mean(bpl_flag_AKHI == 1) + mean(bpl_flag_usa == 1) +
    mean(bpl_flag_territory == 1) + mean(bpl_flag_foreign == 1) +
    mean(bpl_flag_missing == 1)
)
# Stratum key: one string per record of the form
#   a<death_age>_y<dyear>_s<sex_string>_r<census_race>_b<bpl_key>
censoc_numident_toweight[, key := paste0(
  "a", death_age,
  "_y", dyear,
  "_s", sex_string,
  "_r", census_race,
  "_b", bpl_key
)]

# Write the individual-level records to the output directory
write_csv(
  censoc_numident_toweight,
  paste0(path_out, "numident_data_prepped.csv")
)
# Tabulate: one row per stratum with the number of records in it (sample_n).
# The birthplace flags and the key are included in the grouping so they are
# carried along into the tabulated file.
# .groups = "drop" returns a plain ungrouped table; without it summarize()
# leaves the result grouped by all but the last grouping column and emits a
# regrouping message.
censoc_numident_tab <- censoc_numident_toweight %>%
  group_by(death_age, dyear, sex_string, census_race, bpl_key,
           bpl_flag_usa, bpl_flag_territory, bpl_flag_AKHI, bpl_flag_foreign,
           bpl_flag_missing, key) %>%
  summarize(sample_n = n(), .groups = "drop")

# Checks (each should print TRUE)
# no repeat keys: every tabulated row has its own key
length(unique(censoc_numident_tab$key)) == nrow(censoc_numident_tab)
# all keys captured: the tabulation has every key in the individual records
length(unique(censoc_numident_tab$key)) == length(unique(censoc_numident_toweight$key))

# save the tabulated records
write_csv(censoc_numident_tab,
          paste0(path_out, "numident_data_prepared_tabulated.csv"))
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.