knitr::opts_chunk$set(message = F, eval = F)

Setup and Requirements

# Settings
prefix     <- 'db/SPA-YYYY-MM-DD' # prepended to file name
replicates <- 4                   # number of replicates of each strain
density    <- 1536                # plate density
cm         <- system.file('examples/cm.txt', package = 'screenmill')
dr         <- system.file('examples/dr.txt', package = 'screenmill')
dr_control <- system.file('examples/control.txt', package = 'screenmill')
screens    <- system.file('examples/screens.csv', package = 'screenmill')
plates     <- system.file('examples/plates.csv', package = 'screenmill')
# Required packages
library(DT)
library(screenmill)
library(rothfreezer)
library(dplyr)
# Read in all required data
measurements <- screenmill::read_cm(cm, replicates = replicates)
metadata     <- screenmill::read_metadata(screens, plates)
exclusions   <- bind_rows(screenmill::read_dr(dr), screenmill::read_dr(dr_control))
# Strain annotations are available in the rothfreezer package
db           <- rothfreezer::src_rothfreezer()
strains      <- db %>% tbl('strains') %>% select(strain_id, strain_name) %>% collect
collection   <- db %>% tbl('strain_collections') %>% collect
# Unique screens
screens <-  
  metadata %>% 
  select(screen_id:media_id, temperature:screen_notes) %>% 
  distinct

Raw data processing

# Annotate measurements with node/edge IDs, incubation time, and exlusion data
raw_colony_sizes <-
  measurements %>% 
  left_join(metadata) %>%
  left_join(exclusions) %>%
  left_join(collection) %>%
  left_join(strains) %>%
  mutate(
    # Ensure proper variable types
    excluded_query = as.logical(excluded_query),
    plate_control  = as.logical(plate_control),
    size           = as.numeric(size),
    incubation     = as.numeric(incubation),
    incubation_start = as.character(incubation_start),
    incubation_end   = as.character(incubation_end),
    row_numb       = as.numeric(factor(row)), # Convert row letters to numbers
    # Compute colony positions
    nrep = max(replicate),  # number of replicates
    colony_row = ((row_numb - 1) * sqrt(nrep)) + ceiling(replicate / sqrt(nrep)),
    colony_col = ((column   - 1) * sqrt(nrep)) + (replicate - 1 + nrep) %% sqrt(nrep) + 1
  ) %>%
  select(
    # Identification
    screen_id, control_screen_id, strain_id, strain_name, query_id, query_name, 
    plate, row, column, colony_row, colony_col, replicate,
    # Measurements
    size_raw = size, size_dr, circ,
    # Incubation time
    timepoint, incubation, incubation_start, incubation_end,
    # Exclusions and controls
    excluded_query, excluded_control, plate_control
  )

Normalization

Exclusions

exclusions_marked <-
  raw_colony_sizes %>%
  mutate(
    control = (screen_id == control_screen_id),
    # Mark the following excluded observations as NA
    size = ifelse(
      excluded_query |         # excluded in data review
      strain_name == 'blank' | # blank strains
      # slow growing strains (less than 25% growth of control screen median)
      (control & size_raw < 0.25 * median(size_raw[control], na.rm = T)) |
      # MATalpha library has his border, so exclude edges
      (row    %in% c(min(row), max(row))) |
      (column %in% c(min(column), max(column))),
      NA, size_raw)
  ) %>%
  select(-control)

Edge scaling

edge_adjusted <-
  # After marking exclusions
  exclusions_marked %>%
  # For each plate at a given timepoint
  group_by(screen_id, timepoint, plate) %>%
  # Adjust the colony size such that
  mutate(
    # Outer edges
    edge1 = colony_col %in% c(1, max(colony_col)) | 
            colony_row %in% c(1, max(colony_row)),
    edge2 = colony_col %in% c(2, max(colony_col) - 1) | 
            colony_row %in% c(2, max(colony_row) - 1),
    edge  = edge1 | edge2,
    # Are scaled to the median of non-edge colony sizes
    size = 
      ifelse(
        edge1, 
        size * (median(size[!edge], na.rm = T) / median(size[edge1], na.rm = T)),
      ifelse(
        edge2,
        size * (median(size[!edge], na.rm = T) / median(size[edge2], na.rm = T)),
      size))
  ) %>%
  select(-starts_with('edge'))

Plate control normalization

plate_adjusted <-
  # After adjusting edges
  edge_adjusted %>%
  # For each plate at a given timepoint
  group_by(screen_id, timepoint, plate) %>%
  # Calculate median of plate controls
  mutate(plate_median = median(size[plate_control], na.rm = T)) %>%
  # For each screen at a given timepoint
  group_by(screen_id, timepoint) %>%
  # Adjust the colony size such that the plate median is scaled to screen median
  mutate(
    screen_median = median(size[plate_control], na.rm = T),
    size = size * (screen_median / plate_median)
  ) %>%
  select(-screen_median, -plate_median)

Position normalization

position_adjusted <-
  # After plate adjustment
  plate_adjusted %>%
  # For each screen at a given timepoint
  group_by(screen_id, timepoint) %>%
  # Re-calculate screen median
  mutate(screen_median = median(size[plate_control], na.rm = T)) %>%
  # For each plate at a given timepoint
  group_by(screen_id, timepoint, plate) %>%
  # Adjust colony size to remove spatial effect
  mutate(
    spatial_effect = screenmill::spatial_effect(colony_row, colony_col, size),
    size = size * (screen_median / spatial_effect)
  ) %>%
  select(-screen_median)

Final normalization

normalized <-
  # After all adjustments
  position_adjusted %>%
  # For each screen at a given timepoint
  group_by(screen_id, timepoint) %>%
  # Place lower limit on size
  mutate(
    size = ifelse(size < 0.01, 0.01, size),
    screen_median = median(size[plate_control], na.rm = T),
    screen_sd     = sd(size[plate_control], na.rm = T)
  ) %>%
  ungroup

Interaction scores

# Select control data
control <-
  normalized %>%
  filter(screen_id == control_screen_id) %>%
  select(
    control_screen_id, timepoint, strain_id, strain_name, 
    plate, row, column, replicate, colony_row, colony_col, 
    size_control    = size, 
    size_control_wt = screen_median, 
    sd_control_wt   = screen_sd
  )

# Select query data
queries <-
  normalized %>%
  filter(screen_id != control_screen_id) %>%
  select(
    screen_id, timepoint, control_screen_id, strain_id:replicate, 
    colony_row, colony_col, 
    size_query    = size, 
    size_query_wt = screen_median, 
    sd_query_wt   = screen_sd
  )

scores <-
  left_join(queries, control) %>%
  # Group by strain to agregate replicates
  group_by(
    screen_id, timepoint, strain_id, strain_name, query_id, query_name, 
    plate, row, column
  ) %>%
  summarise(
    size_query_wt   = mean(size_query_wt),
    size_control_wt = mean(size_control_wt),
    n_query         = length(na.omit(size_query)),
    n_control       = length(na.omit(size_control)),
    size_query      = mean(size_query, na.rm = T),
    size_control    = mean(size_control, na.rm = T)
  ) %>% ungroup %>%
  mutate(
    # Calculate fitness estimates.
    Fi    = size_query_wt / size_control_wt,
    Fj    = size_control  / size_control_wt,
    Fij   = size_query    / size_control_wt,
    Eij   = Fi * Fj,
    Elogr = log2(Fij / Eij), # Centers on 0
    Ediff = (Fij - Eij)      # Centers on 0
  ) %>%
  # Calculate Z-score based on SD and mean of each screen at a given timepoint
  group_by(screen_id, timepoint) %>%
  mutate(
    Zlogr = (Elogr - mean(Elogr, na.rm = T)) / sd(Elogr,  na.rm = T),
    Zdiff = (Ediff - mean(Ediff, na.rm = T)) / sd(Ediff,  na.rm = T)
  ) %>%
  ungroup %>%
  filter(complete.cases(.))

Final Review

Screens

screens %>%
  select(
    screen_id, query_id, query_name, strain_collection_id, method_id, 
    media_id, temperature, screen_description
  ) %>%
  datatable

Tests

# How many incomplete cases are there?
if (nrow(raw_colony_sizes) - nrow(na.omit(raw_colony_sizes))) {
  warning(paste0(
    'NA values were present after annotating measurements. ',
    'Check the CM engine, plates, and screens files for missing data.'))
} else {
  message('Annotated measurements are not missing any data.')
}

# Is there a difference in the number of measurements and the final dataset?
if (nrow(measurements) - nrow(raw_colony_sizes)) {
  warning(paste0(
    'The number of measurements does not match the number of annotated measurements. ',
    'Check for proper measurement annotation.'))
} else {
  message('Measurements have uniquely mapped to annotations.')
}

# Are plates uniquely grouped by screen_id, timepoint, and plate variables
observed_density <- count(normalized, screen_id, timepoint, plate)
if (any(observed_density$n != density)) {
  warning('The number of colonies on each plate does not match specified density.')
  observed_density
} else {
  message('Plate groupings match specified plate density.')
}

Write to file

normalized %>% write.csv(paste0(prefix, '-measurements.csv'), row.names = FALSE)
scores     %>% write.csv(paste0(prefix, '-scores.csv'), row.names = FALSE)
screens    %>% write.csv(paste0(prefix, '-screens.csv'), row.names = FALSE)

Session Info

options(width = 85)
devtools::session_info()


EricEdwardBryant/screenmill documentation built on March 13, 2020, 1:07 p.m.