Summary: This code tracks the total number of CenSoc users. Specifically, it reads in a log of all people who have signed up to download CenSoc data. Next, it creates parses download-log files from the website.

## library packages
library(tidyverse)
library(here)
library(lubridate)
library(ApacheLogProcessor)
library(stringr)
library(cowplot)
library(DBI)
library(RSQLite)

Download Sign-up Logs

## censoc download sqlite DBI
path.to.db <- "/admin/lab_writeable/Censoc1313/Sqdata/crusers.db"  # not renamed from Creles project

## create temp file 
tmp.db <- tempfile()

# user needs to own file or something, otherwise it is locked. So make a copy
system( paste0("cp -a ",  path.to.db, " ", tmp.db ) )

  db <- dbConnect(SQLite(), tmp.db ,  flags = SQLITE_RO, synchronous = NULL)

  dbListTables(db)
  signup_log <- dbGetQuery(db, "SELECT * from users LIMIT 1000000")  # records are transactions, not unique users
  dbGetQuery(db, "SELECT COUNT(*) from users")

  # DBI returns data.frames
  str( dbGetQuery(db, "SELECT * from users LIMIT 10") )

  dbDisconnect(db)

## Filter to dates after our first release (April 1st, 2020)
users <- signup_log %>% 
  mutate(date = as_date(dateh)) %>% 
  filter(date > "2020-04-01") %>% 
  arrange(desc(date))

## New users
users <- users %>% 
  mutate(fname = tools::toTitleCase(tolower(fname)),
         lname = tools::toTitleCase(tolower(lname))) %>% 
  group_by(fname, lname) %>% 
  arrange(date) %>% 
  filter(row_number() == 1)

Visualize new sign-ups

## number of new users by month
users %>% 
  group_by(month = lubridate::floor_date(date, "month")) %>%
  tally()

## number of cumulative (new) users
total_censoc_users <- users %>% 
  group_by(date) %>% 
  tally() %>% 
  mutate(source = "New Signup",
         total = cumsum(n)) %>% 
  ggplot() + 
  geom_line(aes(x = date, y = total), size = 1, color = "black") + 
  theme_cowplot() + 
  labs(y = "Users",
       x = "Date",
       title = "Total CenSoc Users (Unique)")

## plot cumultative number of users 
ggsave(plot = total_censoc_users, filename = here("vignettes/user_logs/figs/total_censoc_users.png"), height = 4, width = 6)

Parse String Logs from Website

The code below parses string logs from the CenSoc data download page to figure out which datasets a user downloaded.

## Parse Strings 
censocLogs <- read.multiple.apache.access.log("/admin/lab_writeable/Censoc1313/Weblogs/",
prefix="censoc-access.log")

## Restrict to data 
## Note: Only getting activity from 2023. Needs to figure out how to get all downloads
download_logs <- censocLogs %>% 
  filter(httpcode == 200) %>% 
  filter(str_detect(url, pattern = "GET /Data/")) 

## scrape url to figure out which dataset was downloaded 
download_logs <- download_logs %>% 
  mutate(dataset_downloaded = case_when(
    str_detect(url, pattern = "bunmd_v1") == TRUE ~ "BUNMD",
    str_detect(url, pattern = "bunmd_v2") == TRUE ~ "BUNMD",
    str_detect(url, pattern = "numident_v1") == TRUE ~ "CenSoc-Numident",
    str_detect(url, pattern = "numident_v2") == TRUE ~ "CenSoc-Numident",
    str_detect(url, pattern = "numident_demo") == TRUE ~ "CenSoc-Numident Demo",
    str_detect(url, pattern = "dmf_v1") == TRUE ~ "CenSoc-DMF",
    str_detect(url, pattern = "dmf_v2") == TRUE ~ "CenSoc-DMF",
    str_detect(url, pattern = "dmf_demo") == TRUE ~ "CenSoc-DMF Demo",
    str_detect(url, pattern = "numident_enlistment") == TRUE ~ "CenSoc-Numident WWII Enlistment",
    str_detect(url, pattern = "dmf_enlistment") == TRUE ~ "CenSoc-DMF WWII Enlistment",  
    TRUE ~ NA_character_
  )) %>% 
  mutate(url2 = url) %>% 
  separate(url, c(NA, NA, "link"), sep = "/") %>% 
  mutate(link = tolower(link))

Create a list of all downloads

## all downloads 
signup_log <- signup_log %>% 
  mutate(link = tolower(link))  

## combine all signups with all download logs
censoc_downloaders <- download_logs %>% 
  left_join(signup_log) 

## Create long dataset
censoc_downloaders_long <- censoc_downloaders %>% 
  group_by(fname, lname, dataset_downloaded) %>% 
  filter(row_number() == 1) %>% 
  dplyr::select(fname, lname, email, dataset_downloaded, dateh) %>% 
  ungroup() %>% 
  filter(!is.na(dataset_downloaded)) 

summary(as_date(censoc_downloaders_long$dateh)) # Only 2023

## visualize downloads 
downloads_by_dataset <- censoc_downloaders_long %>% 
  filter(dateh >= as_date("2022-01-01")) %>% 
  filter(!is.na(dataset_downloaded)) %>% 
  group_by(dataset_downloaded) %>% 
  tally() %>% 
  ggplot() + 
  geom_col(aes(x = reorder(dataset_downloaded, n), y = n), color = "black", fill = "grey") + 
  geom_text(aes(x = reorder(dataset_downloaded, n), y = n + 3, label = n), color = "black") + 
  coord_flip() + 
  labs(x = "",                                                     
       y = "Downloads",
       title = "Downloads by Dataset (January - October 2023)") + 
  theme_cowplot()

## save file 
ggsave(plot = downloads_by_dataset, filename = here("vignettes/user_logs/figs/downloads_by_dataset.png"), height = 5, width = 9)

Match users onto the dataset they downloaded

## Create long dataset 
censoc_downloaders_wide <- censoc_downloaders_long %>% 
  mutate(download = 1) %>% 
  pivot_wider(id_cols = c(fname, lname, email),
              names_from = dataset_downloaded, 
              values_from = download)

## combine with original user file
users <- users %>% 
  left_join(censoc_downloaders_wide %>% 
              select(-fname, -lname), by = ("email")) 

Save signup log

TODO: Get all downloads prior to 2023

write_csv(signup_log %>% mutate(date_obj = as.Date(substring(dateh, 1, 10))),
          "/data/censoc/miscellaneous/user_logs/old_download_page_signups.csv")


caseybreen/wcensoc documentation built on Nov. 21, 2024, 5:15 a.m.