To create this file, we will need to read in a truncated version of the BUNMD and ZIP code of death variables files along with place of birth variables that were created using Seth Sanders' Crosswalk.
To construct the “place of birth” variables, we used a crosswalk originally developed for this paper:
Black, Dan A., Seth G. Sanders, Evan J. Taylor, and Lowell J. Taylor. 2015. “The Impact of the Great Migration on Mortality of African Americans: Evidence from the Deep South.” American Economic Review 105(2):477–503. doi: 10.1257/aer.20120642.
library(tidyverse) library(data.table) library(dplyr)
file paths
# zip features data (ZIP to state, county, region, etc) zip_features_path <- "/data/censoc/workspace/geography_variables/Zip_Features.csv" # string to GNIS feature by ssn, using Black et al. bunmd_birthplace_crosswalk_path <- "/data/censoc/workspace/geography_variables/bunmd_Birthplace.csv" # raw unclean birthplace strings unclean_birthplace_stings_path <- "/data/censoc/workspace/geography_variables/bunmd_city_string.csv" # full BUNMD V2.0 bunmd_path <- "/data/censoc/censoc_data_releases/bunmd/bunmd_v2/bunmd_v2.csv" # path to published Numident 2.1 numident_path <- "/data/censoc/censoc_data_releases/censoc_numident/censoc_numident_v2.1/censoc_numident_v2.1.csv" # bunmd to numident ID variable crosswalk ssn_crosswalk_path <- "/data/censoc/crosswalks/numident_ssn_histid_v2_1_crosswalk.csv" # state FIPS codes state_fips_path <- "/data/censoc/workspace/geography_variables/state_fips_codes.csv" # path to write files geo_files_out_path <- "/data/censoc/workspace/geography_variables/"
Here, I read in the data and select only for the variables of interest.
Zip_Features <- read.csv(zip_features_path, colClasses = c(zip = "character",FIPS = "character")) Zip_Features <- subset(Zip_Features, select=-c(RUC1974, area_name, conterminous, acceptable_cities,district_name,RUC1983,timezone,area_codes,OASDI_benefits_2005,OASDI_total_2005,type,elevation,state_full,delv_address,OASDI_average_2005,OASDI_disabled_2005,OASDI_benefits_2005,OASDI_retired_2005,geocode)) birthplaces_df <- fread(bunmd_birthplace_crosswalk_path, select=c("ssn","bpl_city","feature_id","county_name", "Region", "county_numeric", "bpl_string")) bunmd_trunc <- fread(bunmd_path, select=c("ssn", "zip_residence","bpl_string"))
Next, I rename the ZIP code death place variables so that it'll be clear that these refer to places of death in the final file.
Zip_Features <- Zip_Features %>% rename( death_fips = names(Zip_Features)[1], death_zip = names(Zip_Features)[2], death_state = names(Zip_Features)[3], death_city = names(Zip_Features)[4], death_county = names(Zip_Features)[5], death_country = names(Zip_Features)[6], death_region = names(Zip_Features)[7], death_ruc1993 = names(Zip_Features)[8] ) head(Zip_Features)
I truncate the BUNMD zip_residence variable so that it only has 5 digits. This is an important step since the places of death file only contains 5 digit ZIP codes, and gives us a variable that we can match on.
bunmd_trunc <- bunmd_trunc %>% mutate( zip = str_trunc(as.character(zip_residence), width = 5, side = "right", ellipsis = "") ) %>% dplyr::select(-c(zip_residence)) %>% rename(death_zip = zip) head(bunmd_trunc)
add birth region using bpl_string from BUNMD (IPUMS-defined regions)
west_states <- c("Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah", "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington") midwest_states <- c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin", "Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota") northeast_states <- c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont", "New Jersey", "New York", "Pennsylvania") south_states <- c("Delaware", "Florida", "Georgia", "Maryland", "North Carolina", "South Carolina", "Virginia", "West Virginia", "Alabama", "Kentucky", "Mississippi", "Tennessee", "Arkansas", "Louisiana", "Oklahoma", "Texas") bunmd_trunc$birth_region <- ifelse(bunmd_trunc$bpl_string %in% west_states, "west", ifelse(bunmd_trunc$bpl_string %in% midwest_states, "midwest", ifelse(bunmd_trunc$bpl_string %in% northeast_states, "northeast", ifelse(bunmd_trunc$bpl_string %in% south_states, "south", NA))))
Once we have the matching ZIP code variables, we merge the information from the ZIP code file to the BUNMD file. This results in a new file that contains a person's SSN, ZIP code of death, and additional information sourced from the ZIP code file, such as the corresponding city, county, and state.
bunmd_trunc <- merge(bunmd_trunc, Zip_Features, by = "death_zip", all.x = TRUE,sort = FALSE) rm(Zip_Features) head(bunmd_trunc)
bunmd_trunc$death_state <- ifelse(bunmd_trunc$death_county == "DISTRICT OF COLUMBIA", "DC", bunmd_trunc$death_state)
Next, we rename birth place variables.
birthplaces_df <- birthplaces_df %>% rename( birth_city = names(birthplaces_df)[2], birth_gnis_code = names(birthplaces_df)[3], birth_county = names(birthplaces_df)[4], birth_region = names(birthplaces_df)[5], birth_fips = names(birthplaces_df)[6]) head(birthplaces_df)
merge county and state numeric codes to get birth fips
# read state FIPS codes state_fips <- fread(state_fips_path, select = c("stname", "st")) names(state_fips) <- c("bpl_string", "state_fips_code") state_fips <- state_fips %>% mutate(state_fips_code = str_pad(as.character(state_fips_code), width = 2, side = "left", pad = "0")) # pad county FIPS codes birthplaces_df <- birthplaces_df %>% mutate(birth_fips = str_pad(as.character(birth_fips), width = 3, side = "left", pad = "0")) # add state codes to birthplaces_df birthplaces_df <- birthplaces_df %>% left_join(state_fips, by = "bpl_string") # combine to get full FIPS codes birthplaces_df <- birthplaces_df %>% mutate(birth_fips = paste0(state_fips_code, birth_fips)) # drop if not a proper 5 digit FIPS code (small number of records are missing county code) birthplaces_df <- birthplaces_df %>% mutate(birth_fips = ifelse(nchar(birth_fips) < 5, "", birth_fips)) birthplaces_df$state_fips_code <- NULL
Next, we merge birth place variables to the BUNMD file. These variables include the city, county, and state where the person was born. Now, we have a new file that contains information on people's death and birthplaces, in addition to their SSN.
birthplaces_df <- merge(bunmd_trunc, birthplaces_df, by = "ssn", all.x = TRUE,sort = FALSE) rm(bunmd_trunc) head(birthplaces_df)
Let's see how well of a job the crosswalk did and count how many birth states in the BUNMD match with the birth states in the birthplaces file.
birthplaces_df$birth_place_matched <- ifelse(birthplaces_df$bpl_string.x == birthplaces_df$bpl_string.y & !is.na(birthplaces_df$bpl_string.x),0, ifelse(birthplaces_df$bpl_string.x != birthplaces_df$bpl_string.y & !is.na(birthplaces_df$bpl_string.x),1,NA)) print(paste("There are",sum(birthplaces_df$birth_place_matched,na.rm=T),"mismatched birth states in the birth places data frame"))
manually add info for people born in Washington D.C (it's all the same GNIS ID, county, etc, but don't want them to look like NAs)
birthplaces_df[bpl_string.x == "District of Columbia", birth_gnis_code := 1702382] birthplaces_df[bpl_string.x == "District of Columbia", birth_region.x := "south"] birthplaces_df[bpl_string.x == "District of Columbia", birth_county := "District of Columbia"] birthplaces_df[bpl_string.x == "District of Columbia", birth_fips := "11001"] birthplaces_df[bpl_string.x == "District of Columbia", birth_city := "District of Columbia"] birthplaces_df[bpl_string.x == "District of Columbia"]
birthplaces_df <- birthplaces_df %>% select(-birth_region.y, -bpl_string.y, -bpl_string.x, -birth_place_matched) %>% rename(birth_region = names(.)[3])
In the chunks of code below, we standardize the data set and make everything lowercase. We also make sure that any differences in wording are standardized.
birthplaces_df$birth_city <- tolower(birthplaces_df$birth_city) birthplaces_df$birth_county <- tolower(birthplaces_df$birth_county) birthplaces_df$death_state <- tolower(birthplaces_df$death_state) birthplaces_df$death_city <- tolower(birthplaces_df$death_city) birthplaces_df$death_county <- tolower(birthplaces_df$death_county) head(birthplaces_df)
birthplaces_df <- birthplaces_df %>% mutate( death_region = ifelse(death_region == "North_East", "northeast", death_region) ) birthplaces_df$death_region <- tolower(birthplaces_df$death_region) birthplaces_df <- birthplaces_df %>% mutate(death_region = str_trim(death_region)) %>% mutate(death_region = ifelse(death_region == "midewest", "midwest", death_region)) table(birthplaces_df$death_region)
standardize county birth and death names (not perfect, but captures some major differences)
birthplaces_df <- birthplaces_df %>% mutate( death_county = str_remove_all(death_county, "county"), #remove the word "county" death_county = str_remove_all(death_county, "parish"), # remove "parish" death_county = str_trim(death_county), birth_county = str_replace_all(birth_county, "\\(ca\\)", "census area"), # ca --> census area birth_county = str_remove_all(birth_county, "\\("), # remove parens birth_county = str_remove_all(birth_county, "\\)"), birth_county = str_trim(birth_county) ) birthplaces_df <- birthplaces_df %>% mutate( death_county = str_remove_all(death_county, "city and borough of"), death_county = str_remove_all(death_county, "city and borough"), death_county = str_trim(death_county)) birthplaces_df <- birthplaces_df %>% mutate(death_county = str_remove_all(death_county, " borough"), #remove the full word "borough" death_county = str_trim(death_county)) birthplaces_df <- birthplaces_df %>% mutate(birth_county = str_remove_all(birth_county, "\\."), # remove periods birth_county = str_trim(birth_county)) birthplaces_df <- birthplaces_df %>% mutate(death_county = str_remove_all(death_county, "\\."), # remove periods death_county = str_trim(death_county)) birthplaces_df <- birthplaces_df %>% mutate(death_county = str_remove_all(death_county, "municipality of"), death_county = str_remove_all(death_county, "municipality"), # death_county = str_trim(death_county)) # some special cases birthplaces_df[death_fips == "17099", death_county := "lasalle"] # LaSalle, IL birthplaces_df[birth_fips == "17099", birth_county := "lasalle"] birthplaces_df[death_fips == "48283", death_county := "la salle"] # La Salle, TX birthplaces_df[birth_fips == "48283", birth_county := "la salle"] birthplaces_df[death_fips == "22059", death_county := "lasalle"] # LaSalle (parish), LA birthplaces_df[birth_fips == "22059", birth_county := "lasalle"] birthplaces_df[birth_county == "doña ana", birth_county := "dona ana"] birthplaces_df[birth_county == "carson city city", birth_county := "carson city"] birthplaces_df[birth_fips == death_fips & birth_county != death_county] %>% group_by(birth_county, death_county) %>% tally() %>% arrange(desc(n)) setdiff(unique(birthplaces_df$death_county), unique(birthplaces_df$birth_county)) setdiff(unique(birthplaces_df$birth_county), unique(birthplaces_df$death_county)) head(birthplaces_df$death_county, 10)
birthplaces_df$death_state <- gsub("\\b\\s+", "", birthplaces_df$death_state) birthplaces_df <- birthplaces_df %>% mutate( death_state = case_when( death_state == "" ~ NA_character_, death_state == "arizona14/ " ~ "arizona", death_state == "newhampshire" ~ "new hampshire", death_state == "southcarolina" ~ "south carolina", death_state == "newjersey" ~ "new jersey", death_state == "newmexico" ~ "new mexico", death_state == "northdakota" ~ "north dakota", death_state == "northcarolina" ~ "north carolina", death_state == "southdakota" ~ "south dakota", death_state == "westvirginia" ~ "west virginia", death_state == "rhodeisland" ~ "rhode island", death_state == "newyork" ~ "new york", TRUE ~ death_state ) ) unique(birthplaces_df$death_state) # standardize washington dc death city string to match death county, birth city, birth county birthplaces_df[death_state == "dc", death_city := "district of columbia"]
Harmonize a few FIPS codes in the death county field that are out of date to align with birth FIPS codes (again not comprehensive but takes care of most discrepancies, mostly Miami-Dade)
# Dade (FL) ---> Miami-Dade birthplaces_df[death_fips == "12025", death_fips := "12086"] birthplaces_df[death_fips == "12086", death_county := "miami-dade"] # Shannon (SD) ---> Oglala Dakota birthplaces_df[death_fips == "46113", death_fips := "46102"] birthplaces_df[death_fips == "46102", death_county := "oglala lakota"] # wade hampton census area (AK) ---> Kusilvak Census Area birthplaces_df[death_fips == "02270", death_fips := "02158"] birthplaces_df[death_fips == "02158", death_county := "kusilvak census area"] setdiff(unique(birthplaces_df$death_county), unique(birthplaces_df$birth_county)) # mostly PR setdiff(unique(birthplaces_df$birth_county), unique(birthplaces_df$death_county))
We also include the "uncleaned" birthplace string for researchers who might want to work with them
birth_city_uncleaned <- fread(unclean_birthplace_stings_path) head(birth_city_uncleaned) nrow(birth_city_uncleaned)
# merge raw strings onto dataset birthplaces_df <- merge(birthplaces_df, birth_city_uncleaned, by = "ssn", all.x = TRUE,sort = FALSE) birthplaces_df <- select(birthplaces_df, -c(V1,bpl_city_oflo)) birthplaces_df <- birthplaces_df %>% rename(birth_city_uncleaned = names(birthplaces_df)[15]) birthplaces_df$birth_city_uncleaned <- tolower(birthplaces_df$birth_city_uncleaned) birthplaces_df <- birthplaces_df %>% relocate(names(birthplaces_df)[15], .before = names(birthplaces_df)[11]) head(birthplaces_df) rm(birth_city_uncleaned) # quick look at unmatched birth strings (foreign birthplaces, incomprehensible strings) birthplaces_df[!is.na(birth_city_uncleaned) & is.na(birth_city)]
Then, we remove all people who have neither a birth or a death city.
# filtering on birth_region removes people who don't have ANY birth *state* info # or are born abroad. filtering on birth_city removes people whose city of birth # was not matched to a valid location. birthplaces_df <- birthplaces_df %>% mutate( deletes = paste(birth_city, death_city) ) %>% filter(!deletes %in% "NA NA") %>% select(-deletes) head(birthplaces_df)
rearrange a bit
birthplaces_df <- birthplaces_df %>% relocate(birth_region, .before = birth_city_uncleaned) %>% relocate(death_fips, .after = death_county)
make NA values consistent across variable types character variables should have blank strings "" as missing values
names(birthplaces_df) str(birthplaces_df) birthplaces_df[is.na(death_state), death_state := ""] birthplaces_df[is.na(death_fips), death_fips := ""] birthplaces_df[is.na(birth_fips), birth_fips := ""] birthplaces_df[is.na(death_city), death_city := ""] birthplaces_df[is.na(death_county), death_county := ""] birthplaces_df[is.na(death_country), death_country := ""] birthplaces_df[is.na(death_region), death_region := ""] birthplaces_df[is.na(birth_city), birth_city := ""] birthplaces_df[is.na(birth_county), birth_county := ""] birthplaces_df[is.na(birth_region), birth_region := ""] birthplaces_df[is.na(birth_city_uncleaned), birth_city_uncleaned := ""] str(birthplaces_df)
Make final BUNMD dataset with columns in selected order and name
bunmd_geo_dataset <- birthplaces_df %>% select(ssn, birth_gnis_code, birth_city, birth_county, birth_fips, birth_region, death_zip, death_city, death_county, death_fips, death_state, death_region, death_country, death_ruc1993) # make sure this looks ok head(bunmd_geo_dataset) str(bunmd_geo_dataset)
Next, we save the file as a csv. This file is usable with the BUNMD
write.csv(bunmd_geo_dataset, paste0(geo_files_out_path, "BUNMD_Geography_File.csv"), row.names = FALSE)
Next, let's read in our HISTID crosswalk and merge it onto the Geography File so we can use it to merge onto the Numident.
HISTID <- fread(ssn_crosswalk_path) birthplaces_df <- fread(paste0(geo_files_out_path, "BUNMD_Geography_File.csv"))
numident_birthplaces_df <- left_join(HISTID, birthplaces_df, by = "ssn") numident_birthplaces_df <- numident_birthplaces_df %>% select(-ssn) numident_birthplaces_df <- numident_birthplaces_df[!is.na(birth_city) & !is.na(death_city)] write.csv(numident_birthplaces_df, paste0(geo_files_out_path, "Numident_Geography_File.csv"), row.names = FALSE)
censoc_numident <- fread(numident_path, select = c("HISTID", "link_abe_exact_conservative")) # limit to conservative links censoc_numident <- censoc_numident[link_abe_exact_conservative == 1] numident_birthplaces_df_conservative <- numident_birthplaces_df[histid %in% censoc_numident$HISTID] # capitalize histid variable numident_birthplaces_df_conservative <- numident_birthplaces_df_conservative %>% rename(HISTID = histid) numident_birthplaces_df_conservative[, birth_fips := str_pad(as.character(birth_fips), width = 5, side = "left", pad = "0")] numident_birthplaces_df_conservative[, death_fips := str_pad(as.character(death_fips), width = 5, side = "left", pad = "0")] numident_birthplaces_df_conservative[is.na(death_fips), death_fips := ""] numident_birthplaces_df_conservative[is.na(birth_fips), birth_fips := ""] # write write.csv(numident_birthplaces_df_conservative, paste0(geo_files_out_path, "numident_geography_file_v2.1_conservative.csv"), row.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.