Nothing
#' Get overlap between query and predicted drug combination signatures.
#'
#' Drugs with the largest positive and negative cosine similarity are predicted to,
#' respectively, mimic and reverse the query signature. Values range from +1 to -1.
#'
#' To predict and query all 856086 two-drug cmap combinations, the 'average'
#' \code{method} can take as little as 10 minutes (Intel Core i7-6700). The 'ml'
#' (machine learning) \code{method} takes two hours on the same hardware and
#' requires ~10GB of RAM but is slightly more accurate. Both methods will run
#' faster by specifying only a subset of drugs using the \code{include} parameter.
#' To speed up the 'ml' method, the MRO+MKL distribution of R can help
#' substantially (\href{https://mran.revolutionanalytics.com/open/}{link}).
#' The combinations of LINCS l1000 signatures (~26 billion) can also be queried
#' using the 'average' \code{method}. In order to compare l1000 results to those
#' obtained with cmap, only the same genes should be queried (see example).
#'
#' @import data.table ccdata AnnotationDbi xgboost
#' @importFrom foreach foreach %dopar%
#'
#' @param query_genes Named numeric vector of differentual expression values for
#' query genes. Usually 'meta' slot of \code{get_dprimes} result.
#' @param drug_info Character vector specifying which dataset to query
#' (either 'cmap' or 'l1000'). Can also provide a matrix of differential expression
#' values for drugs or drug combinations (rows are genes, columns are drugs).
#' @param method One of 'average' (default) or 'ml' (machine learning -
#' see details and vignette).
#' @param include Character vector of drug names for which combinations with all
#' other drugs will be predicted and queried. If \code{NULL} (default),
#' all two drug combinations will be predicted and queried.
#' @param ncores Integer, number of cores to use for method 'average'. Default is
#' to use all cores.
#'
#' @return Vector of cosine similarities between query and drug combination signatures.
#' @export
#'
#' @examples
#' library(lydata)
#' library(crossmeta)
#'
#' # location of data
#' data_dir <- system.file("extdata", package = "lydata")
#'
#' # gather GSE names
#' gse_names <- c("GSE9601", "GSE15069", "GSE50841", "GSE34817", "GSE29689")
#'
#' # load previous analysis
#' anals <- load_diff(gse_names, data_dir)
#'
#' # perform meta-analysis
#' es <- es_meta(anals)
#'
#' # get dprimes
#' dprimes <- get_dprimes(es)
#'
#' # query combinations of metformin and all other cmap drugs
#' top_met_combos <- query_combos(dprimes$all$meta, include = 'metformin', ncores = 1)
#'
#' # previous query but with machine learning method
#' # top_met_combos <- query_combos(dprimes$all$meta, method = 'ml', include = 'metformin')
#'
#' # query all cmap drug combinations
#' # top_combos <- query_combos(dprimes$all$meta)
#'
#' # query all cmap drug combinations with machine learning method
#' # top_combos <- query_combos(dprimes$all$meta, method = 'ml')
#'
#' # query l1000 and cmap using same genes
#' # library(ccdata)
#' # data(cmap_es)
#' # data(l1000_es)
#' # cmap_es <- cmap_es[row.names(l1000_es), ]
#'
#' # met_cmap <- query_combos(dprimes$all$meta, cmap_es, include = 'metformin')
#' # met_l1000 <- query_combos(dprimes$all$meta, l1000_es, include = 'metformin')
query_combos <- function(query_genes, drug_info = c('cmap', 'l1000'), method = c('average', 'ml'), include = NULL, ncores=parallel::detectCores()) {
if (method[1] == 'ml' & drug_info[1] != 'cmap') {
stop("Machine learning method only available for drug_info = 'cmap'.")
}
if (is.character(drug_info)) {
# default to cmap_es for drug_info
fname <- paste0(drug_info[1], '_es')
utils::data(list = fname, package = "ccdata", envir = environment())
drug_info <- get(fname)
rm(list = fname)
}
# sending l1000_es to 8 cores requires ~30G free RAM
if (ncol(drug_info) > 10000)
ncores <- max(round(ncores/2), 1)
drugs <- colnames(drug_info)
cpds <- gsub('_.+?$', '', drugs)
# check 'include
if (FALSE %in% (include %in% drugs | include %in% cpds)) {
message("Drugs in 'include' not found.")
return(NULL)
}
if (!is.null(include)) {
# either specified dose_time or all cpds
include <- drugs[(cpds %in% include|drugs %in% include)]
} else {
# query all combinations if include is NULL
include <- drugs
}
# use average model
if (method[1] != "ml") {
return(query_combos_average(query_genes, drug_info, include, ncores))
# use machine learning 'ml' model
} else {
dat <- load_ccdata(drug_info)
exclude <- c()
res <- c()
i <- 1
pb <- utils::txtProgressBar(min=0, max=length(include), style=3)
for (drug in include) {
combos_es <- predict_combos(drug, exclude, dat)
if (is.null(combos_es)) break
res <- c(res, query_drugs(query_genes, combos_es, sorted = FALSE))
exclude <- c(exclude, drug)
utils::setTxtProgressBar(pb, i)
i <- i + 1
}
return(sort(res, decreasing = TRUE))
}
}
#---------------
# Query drug combinations using average model.
#
# @param query_genes Named numeric vector of differentual expression values for
# query genes. Usually 'meta' slot of \code{get_dprimes} result.
# @param cmap_es Matrix with cmap dprime values.
# @param include Character vector of cmap drug names for which combinations with all
# other cmap drugs will be predicted and queried. If \code{NULL} (default),
# all 856086 two drug combinations will be predicted and queried.
#
# @return Vector of numeric values between 1 and -1 indicating extent of overlap
# between query and drug combination signatures.
query_combos_average <- function(query_genes, drug_info, include, ncores) {
i = NULL # bind global variable
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)
drugs <- colnames(drug_info)
bins <- get_bins(length(include), ncores)
resl <- foreach::foreach(i=1:min(ncores, length(bins))) %dopar% {
bin <- bins[[i]]
#add to exclude list (for all but first bin)
if (i != 1) {
exclude <- drugs[1:(bin[1]-1)]
} else {
exclude <- c()
}
res <- c()
for (drug in include[bin]) {
# average drug and all other drugs (except excluded)
other_drugs <- setdiff(drugs, c(exclude, drug))
if (length(other_drugs) == 0) break
drug_es <- drug_info[, drug]
# break up task up to reduce memory footprint
dbins <- get_bins(length(other_drugs), 10)
for (dbin in dbins) {
combo_es <- drug_info[, other_drugs[dbin], drop = FALSE]
combo_es <- sweep(combo_es, 1, drug_es, `+`)/2
# update colnames to reflect combo
colnames(combo_es) <- paste(drug, colnames(combo_es), sep = " + ")
# query combo_es
res <- c(res, query_drugs(query_genes, combo_es, FALSE))
}
# add drug to excluded list
exclude <- c(exclude, drug)
}
res
}
parallel::stopCluster(cl)
return(sort(unlist(resl), decreasing = TRUE))
}
#---------------
# Load data for machine learning method of predict_combos.
#
# @param cmap_es Matrix with cmap dprime values. If NULL, will be loaded.
#
# @return List with objects need for 'ml' method of predict_combos
load_ccdata <- function(cmap_es = NULL) {
#bind global variables
cmap_var = net1 = net2 = genes = xgb_mod = NULL
if(is.null(cmap_es))
utils::data("cmap_es", package = "ccdata", envir = environment())
# load everything
utils::data("cmap_var", "net1", "net2", "genes", "xgb_mod",
package = "ccdata", envir = environment())
# keep/order genes as for nnet
cmap_es <- t(cmap_es[genes, ])
cmap_var <- cmap_var[genes, ]
return(list(es=cmap_es, var=cmap_var, net1=net1, net2=net2,
genes=genes, xgb_mod=xgb_mod))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.