Summary: This code tracks the total number of CenSoc users. Specifically, it reads in a log of all people who have signed up to download CenSoc data. Next, it creates parses download-log files from the website.
## library packages library(tidyverse) library(here) library(lubridate) library(ApacheLogProcessor) library(stringr) library(cowplot) library(DBI) library(RSQLite)
## censoc download sqlite DBI path.to.db <- "/admin/lab_writeable/Censoc1313/Sqdata/crusers.db" # not renamed from Creles project ## create temp file tmp.db <- tempfile() # user needs to own file or something, otherwise it is locked. So make a copy system( paste0("cp -a ", path.to.db, " ", tmp.db ) ) db <- dbConnect(SQLite(), tmp.db , flags = SQLITE_RO, synchronous = NULL) dbListTables(db) signup_log <- dbGetQuery(db, "SELECT * from users LIMIT 1000000") # records are transactions, not unique users dbGetQuery(db, "SELECT COUNT(*) from users") # DBI returns data.frames str( dbGetQuery(db, "SELECT * from users LIMIT 10") ) dbDisconnect(db) ## Filter to dates after our first release (April 1st, 2020) users <- signup_log %>% mutate(date = as_date(dateh)) %>% filter(date > "2020-04-01") %>% arrange(desc(date)) ## New users users <- users %>% mutate(fname = tools::toTitleCase(tolower(fname)), lname = tools::toTitleCase(tolower(lname))) %>% group_by(fname, lname) %>% arrange(date) %>% filter(row_number() == 1)
## number of new users by month users %>% group_by(month = lubridate::floor_date(date, "month")) %>% tally() ## number of cumulative (new) users total_censoc_users <- users %>% group_by(date) %>% tally() %>% mutate(source = "New Signup", total = cumsum(n)) %>% ggplot() + geom_line(aes(x = date, y = total), size = 1, color = "black") + theme_cowplot() + labs(y = "Users", x = "Date", title = "Total CenSoc Users (Unique)") ## plot cumultative number of users ggsave(plot = total_censoc_users, filename = here("vignettes/user_logs/figs/total_censoc_users.png"), height = 4, width = 6)
The code below parses string logs from the CenSoc data download page to figure out which datasets a user downloaded.
## Parse Strings censocLogs <- read.multiple.apache.access.log("/admin/lab_writeable/Censoc1313/Weblogs/", prefix="censoc-access.log") ## Restrict to data ## Note: Only getting activity from 2023. Needs to figure out how to get all downloads download_logs <- censocLogs %>% filter(httpcode == 200) %>% filter(str_detect(url, pattern = "GET /Data/")) ## scrape url to figure out which dataset was downloaded download_logs <- download_logs %>% mutate(dataset_downloaded = case_when( str_detect(url, pattern = "bunmd_v1") == TRUE ~ "BUNMD", str_detect(url, pattern = "bunmd_v2") == TRUE ~ "BUNMD", str_detect(url, pattern = "numident_v1") == TRUE ~ "CenSoc-Numident", str_detect(url, pattern = "numident_v2") == TRUE ~ "CenSoc-Numident", str_detect(url, pattern = "numident_demo") == TRUE ~ "CenSoc-Numident Demo", str_detect(url, pattern = "dmf_v1") == TRUE ~ "CenSoc-DMF", str_detect(url, pattern = "dmf_v2") == TRUE ~ "CenSoc-DMF", str_detect(url, pattern = "dmf_demo") == TRUE ~ "CenSoc-DMF Demo", str_detect(url, pattern = "numident_enlistment") == TRUE ~ "CenSoc-Numident WWII Enlistment", str_detect(url, pattern = "dmf_enlistment") == TRUE ~ "CenSoc-DMF WWII Enlistment", TRUE ~ NA_character_ )) %>% mutate(url2 = url) %>% separate(url, c(NA, NA, "link"), sep = "/") %>% mutate(link = tolower(link))
## all downloads signup_log <- signup_log %>% mutate(link = tolower(link)) ## combine all signups with all download logs censoc_downloaders <- download_logs %>% left_join(signup_log) ## Create long dataset censoc_downloaders_long <- censoc_downloaders %>% group_by(fname, lname, dataset_downloaded) %>% filter(row_number() == 1) %>% dplyr::select(fname, lname, email, dataset_downloaded, dateh) %>% ungroup() %>% filter(!is.na(dataset_downloaded)) summary(as_date(censoc_downloaders_long$dateh)) # Only 2023 ## visualize downloads downloads_by_dataset <- censoc_downloaders_long %>% filter(dateh >= as_date("2022-01-01")) %>% filter(!is.na(dataset_downloaded)) %>% group_by(dataset_downloaded) %>% tally() %>% ggplot() + geom_col(aes(x = reorder(dataset_downloaded, n), y = n), color = "black", fill = "grey") + geom_text(aes(x = reorder(dataset_downloaded, n), y = n + 3, label = n), color = "black") + coord_flip() + labs(x = "", y = "Downloads", title = "Downloads by Dataset (January - October 2023)") + theme_cowplot() ## save file ggsave(plot = downloads_by_dataset, filename = here("vignettes/user_logs/figs/downloads_by_dataset.png"), height = 5, width = 9)
## Create long dataset censoc_downloaders_wide <- censoc_downloaders_long %>% mutate(download = 1) %>% pivot_wider(id_cols = c(fname, lname, email), names_from = dataset_downloaded, values_from = download) ## combine with original user file users <- users %>% left_join(censoc_downloaders_wide %>% select(-fname, -lname), by = ("email"))
TODO: Get all downloads prior to 2023
write_csv(signup_log %>% mutate(date_obj = as.Date(substring(dateh, 1, 10))), "/data/censoc/miscellaneous/user_logs/old_download_page_signups.csv")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.