This file contains step-by-step instructions on how to obtain the data and deconvolute phenotypes from pathogen infection screens, as shown in gespeR: A statistical model for deconvoluting off-target-confounded RNA interference screens (Schmich et al., 2015). For full compatibility, please download the development version of gespeR at: https://github.com/fschmich/gespeR
sapply(c("QIAGEN", "DHARMACON", "VALIDATION"), function(x) { download.file(url = sprintf("http://n.ethz.ch/~fschmich/gespeR/%s.rds", x), destfile = sprintf("/tmp/%s.rds", x), quiet = FALSE, mode = "wb") })
require(Matrix) require(gespeR) # Construct TargetRelations objects for each library Q <- TargetRelations("/tmp/QIAGEN.rds") show(Q)
require(dplyr) require(reshape2) require(gespeR) # Read phenotypes phenotypes <- read.delim("/tmp/phenotypes.csv", sep = ",", stringsAsFactors = FALSE) %>% tbl_df() %>% select(SID, GeneID = NCBI.Gene.ID, siRNASet = SIRNA_SET, contains("Infectivity")) %>% melt(id.vars = c("SID", "siRNASet", "GeneID")) %>% tbl_df() %>% select(SID = SID, GeneID, siRNASet, Pathogen = variable, Phenotype = value) %>% mutate(Pathogen = gsub("Infectivity_", "", Pathogen), SID = as.character(SID)) %>% filter(!is.na(Phenotype)) %>% # Artifact of how data is deposited in Pubchem arrange(SID) head(phenotypes) # Read ID mapping between SIDs and Vendor IDs map <- read.delim("/tmp/mapping.txt", header = FALSE, stringsAsFactors = FALSE) map <- map[seq(2, nrow(map), by = 2),] map <- data.frame(t(sapply(map, function(x) { unlist(strsplit(x, split = "SID: | InfectX Consortium: "))[2:3] })), stringsAsFactors = FALSE) %>% tbl_df() rownames(map) <- NULL colnames(map) <- c("SID", "VendorID") # Map IDs phenotypes <- left_join(phenotypes, map, by = "SID") %>% tbl_df() %>% select(SID, VendorID, siRNASet, GeneID, Pathogen, Phenotype) head(phenotypes) # Construct Phenotypes objects for each (Qiagen) library + pathogen combination obs.ssp <- obs.gsp <- list() phenotypes <- split(phenotypes, phenotypes$Pathogen) for (pathogen in names(phenotypes)) { obs.ssp[[pathogen]] <- obs.gsp[[pathogen]] <- list() for (s in 1:4) { spl <- filter(phenotypes[[pathogen]], siRNASet == s, VendorID %in% Q@siRNAs) obs.ssp[[pathogen]][[s]] <- Phenotypes(phenotypes = Matrix(spl$Phenotype), ids = spl$VendorID, pnames = c("Infectivity"), type = "SSP") spl.noNA <- filter(spl, !is.na(GeneID)) %>% group_by(GeneID) %>% summarise(Phenotype = mean(Phenotype, na.rm = TRUE)) %>% filter(!is.nan(Phenotype)) # We need gene-based Phenotypes objects for concordance evaluation obs.gsp[[pathogen]][[s]] <- Phenotypes(phenotypes = Matrix(spl.noNA$Phenotype), ids = as.character(spl.noNA$GeneID), pnames = c("Infectivity"), type = "GSP") } } show(obs.ssp$Bartonella[[1]])
require(gespeR) # Fit gespeR models ans.cv <- list() for (pathogen in c("Bartonella", "Brucella", "Salmonella")) { ans.cv[[pathogen]] <- list() for (s in 1:4) { cat(sprintf("set: %d, pathogen: %s\n", s, pathogen)) ges <- gespeR(phenotypes = obs.ssp[[pathogen]][[s]], target.relations = Q, mode = "cv", alpha = 0.5, ncores = 1) ans.cv[[pathogen]][[s]] <- unloadValues(ges, writeValues = FALSE) } }
# Obtain gene-specific phenotypes (GSPs) ges.gsp <- lapply(ans.cv, function(x) { lapply(x, gsp) })
ges.gsp <- readRDS("~/gespeR_GSPs.rds")
require(gespeR) require(ggplot2) # Function computes concordance between all pairs of phenotypes. Measures used # are Spearman's correlation, rank-biased overlap and the Jaccard index. # Observed phenotypes are cut to the same length as gespeR GSPs, respecting the # proportion of negative and positive phenotypes, in order to guarantee fair # comparison. get.conc <- function(phen, cut = NULL) { min.overlap = 10 rbo.k = 1000 rbo.p = 1-1e-3 rbo.mid <- 0 cor.method = "spearman" uneven.lengths = TRUE if (!is.null(cut)) { # cut longer ranked lists to gespeR's lengths lapply(names(phen), function(x) { l <- lapply(cut[[x]], function(z) { ans <- as.data.frame(z) list(pos = length(which(ans$Infectivity > 0)), neg = length(which(ans$Infectivity < 0))) }) phencut <- lapply(phen[[x]], function(y) { as.data.frame(y) %>% tbl_df() %>% mutate(ID = as.character(ID)) %>% filter(!is.na(Infectivity)) %>% arrange(desc(Infectivity)) }) for (lib in 1:length(l)) { len <- l[[lib]] a <- nrow(phencut[[lib]]) - len$neg + 1 b <- nrow(phencut[[lib]]) phencut[[lib]] <- phencut[[lib]][c(1:len$pos, a:b),] phencut[[lib]] <- Phenotypes(phenotypes = Matrix(phencut[[lib]]$Infectivity), ids = phencut[[lib]]$ID, pnames = c("Infectivity"), type = "SSP") } concordance(phencut, min.overlap = min.overlap, rbo.k = rbo.k, rbo.p = rbo.p, cor.method = cor.method, rbo.mid = rbo.mid, uneven.lengths = uneven.lengths) %>% data.frame %>% select(-lisect) %>% melt(id.vars = c("test.pair", "phen")) %>% mutate(Method = "SSP", Pathogen = x) %>% select(-test.pair, Method, Pathogen, Measure = variable, value) %>% tbl_df() }) %>% do.call("rbind", .) } else { lapply(names(phen), function(x) { concordance(phen[[x]], min.overlap = min.overlap, rbo.k = rbo.k, rbo.p = rbo.p, cor.method = cor.method, rbo.mid = rbo.mid, uneven.lengths = uneven.lengths) %>% data.frame %>% select(-lisect) %>% melt(id.vars = c("test.pair", "phen")) %>% mutate(Method = "gespeR", Pathogen = x) %>% select(-test.pair, Method, Pathogen, Measure = variable, value) %>% tbl_df() }) %>% do.call("rbind", .) } } # Computation of concordance for gespeR GSPs and observed phenotypes conc.gespeR <- get.conc(ges.gsp) conc.obs <- get.conc(obs.gsp, cut = ges.gsp) # Visualisation of concordance measures dat <- rbind(conc.gespeR, conc.obs) %>% tbl_df() %>% mutate(Pathogen = factor(Pathogen, levels = c("Brucella", "Bartonella", "Salmonella"), labels = c("B. abortus", "B. henselae", "S. typhimurium")), Method = factor(Method, levels = c("gespeR", "SSP"), labels = c("gespeR", "Observed"))) dat$Measure <- factor(dat$Measure, levels = c("cor", "rbo.top", "rbo.bottom", "jaccard"), labels = c(expression(rho), expression(rbo["" %down% ""]), expression(rbo["" %up% ""]), expression("J"))) ggplot(data = dat, aes(x = Pathogen, y = value, colour = Method)) + geom_boxplot(outlier.size = 0, width = 0.8) + facet_grid(. ~ Measure, labeller = label_parsed) + xlab("") + ylab("") + scale_colour_manual("", values = c("#d7191c", "#525252"), drop = FALSE) + ylim(c(0, 1)) + theme_bw(base_size = 12, base_family = "Helvetica") + theme(axis.text = element_text(size = rel(1.0)), axis.title = element_text(size = rel(1.0), face = "bold"), strip.text = element_text(size = rel(1.0), face = "bold"), axis.ticks = element_line(colour = "black"), legend.key = element_rect(colour = NA), legend.text = element_text(size = rel(1.0)), legend.title = element_text(size = rel(1.0), face = "bold"), panel.background = element_rect(fill = "white", colour = NA), panel.border = element_rect(fill = NA, colour = "grey50"), panel.grid.major = element_line(colour = "grey90", size = 0.2), panel.grid.minor = element_line(colour = "grey98", size = 0.5), strip.background = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.