R/drugSensitivity.R

Defines functions plotTargetingDrug predictTargetingDrugs loadExpressionDrugSensitivityAssociation `[.expressionDrugSensitivityAssociation` dim.expressionDrugSensitivityAssociation dimnames.expressionDrugSensitivityAssociation readExpressionDrugSensitivityCorHDF5 listExpressionDrugSensitivityAssociation writeExpressionDrugSensitivityCorHDF5 prepareExpressionDrugSensitivityAssociation correlateGEandDrugSensitivity loadGDSC7drugSensitivity loadGDSC7geneExpression loadGDSC7compoundInfo loadGDSC7cellLineInfo loadGDSC7file loadNCI60drugSensitivity loadNCI60geneExpression loadCTRPcompoundInfo loadCTRPdrugSensitivity loadCTRPgeneExpression

Documented in dim.expressionDrugSensitivityAssociation dimnames.expressionDrugSensitivityAssociation listExpressionDrugSensitivityAssociation loadCTRPcompoundInfo loadCTRPdrugSensitivity loadCTRPgeneExpression loadExpressionDrugSensitivityAssociation loadGDSC7cellLineInfo loadGDSC7compoundInfo loadGDSC7drugSensitivity loadGDSC7file loadGDSC7geneExpression loadNCI60drugSensitivity loadNCI60geneExpression predictTargetingDrugs prepareExpressionDrugSensitivityAssociation

# Data loading from CTRP -------------------------------------------------------

#' Load CTRP data
#'
#' If given paths direct to non-existing files, those files will be downloaded
#'
#' @param geneExpressionFile Character: path to file with gene expression
#' @param geneInfoFile Character: path to file with gene information
#' @param cellLineInfoFile Character: path to file with cell line information
#'
#' @importFrom data.table fread
#' @importFrom reshape2 dcast
#' @importFrom stats setNames
#' @importFrom R.utils renameFile
#'
#' @return Data frame
#' @keywords internal
loadCTRPgeneExpression <- function(
    geneExpressionFile="CTRP 2.1/geneExpr.txt",
    geneInfoFile="CTRP 2.1/geneInfo.txt",
    cellLineInfoFile="CTRP 2.1/cellLineInfo.txt") {

    if (!any(file.exists(geneExpressionFile, geneInfoFile))) {
        url <- file.path("ftp://caftpd.nci.nih.gov/pub/OCG-DCC/CTD2/Broad",
                         "CTRPv2.1_2016_pub_NatChemBiol_12_109",
                         "CTRPv2.1_2016_pub_NatChemBiol_12_109.zip")
        folder <- dirname(geneExpressionFile)
        toExtract <- c("v21.data.gex_avg_log2.txt",
                       "v21.meta.gex_features.txt",
                       "v21.meta.per_cell_line.txt")
        downloadIfNotFound(url, file.path(folder, basename(url)),
                           toExtract=toExtract)

        renameFile(file.path(folder, toExtract[[1]]), geneExpressionFile)
        renameFile(file.path(folder, toExtract[[2]]), geneInfoFile)
        renameFile(file.path(folder, toExtract[[3]]), cellLineInfoFile)
    }
    # Prepare gene expression (in wide format)
    message(sprintf("Loading CTRP gene expression from %s...",
                    geneExpressionFile))
    geneExpr <- fread(geneExpressionFile)
    wide     <- dcast(geneExpr, master_ccl_id ~ idx_gene_feature,
                      fun.aggregate=mean, value.var="mrna_expression_avg_log2")
    colnames(wide)[1] <- "cellLine"

    # Correctly set gene names
    message(sprintf("Loading CTRP gene metadata from %s...", geneInfoFile))
    geneInfo  <- fread(geneInfoFile)
    geneNames <- setNames(geneInfo$gene_primary_name, geneInfo$idx_gene_feature)
    colnames(wide)[-1]     <- geneNames[colnames(wide)[-1]]
    attr(wide, "geneInfo") <- geneInfo[geneInfo$gene_primary_name %in%
                                           colnames(wide)[-1], ]

    # Cell line metadata
    cellLineInfo <- fread(cellLineInfoFile)
    colnames(cellLineInfo)[1]  <- "cellLine"
    attr(wide, "cellLineInfo") <- cellLineInfo
    return(wide)
}

#' @rdname loadCTRPgeneExpression
#'
#' @param drugSensitivityFile Character: path to file with drug sensitivity
#' @param experimentFile Character: path to file with experiment information
#' @param compoundFile Character: path to file with compound information
#'
#' @importFrom data.table fread
#' @importFrom reshape2 dcast
#' @importFrom stats setNames
#' @importFrom R.utils renameFile
loadCTRPdrugSensitivity <- function(
    drugSensitivityFile="CTRP 2.1/drugSensitivity.txt",
    experimentFile="CTRP 2.1/experimentInfo.txt",
    compoundFile="CTRP 2.1/compoundInfo.txt") {

    if (!any(file.exists(drugSensitivityFile, experimentFile, compoundFile))) {
        url <- paste0("ftp://caftpd.nci.nih.gov/pub/OCG-DCC/CTD2/Broad/",
                      "CTRPv2.0_2015_ctd2_ExpandedDataset/",
                      "CTRPv2.0_2015_ctd2_ExpandedDataset.zip")
        folder <- dirname(drugSensitivityFile)
        toExtract <- c("v20.data.curves_post_qc.txt",
                       "v20.meta.per_experiment.txt",
                       "v20.meta.per_compound.txt")
        downloadIfNotFound(url, file.path(folder, basename(url)),
                           toExtract=toExtract)

        renameFile(file.path(folder, toExtract[[1]]), drugSensitivityFile)
        renameFile(file.path(folder, toExtract[[2]]), experimentFile)
        renameFile(file.path(folder, toExtract[[3]]), compoundFile)
    }
    message(sprintf("Loading CTRP drug sensitivity from %s...",
                    drugSensitivityFile))
    drugSensitivity <- fread(drugSensitivityFile)

    # Get cell lines from experiment file
    message(sprintf("Loading CTRP experiment metadata from %s...",
                    experimentFile))
    experimentInfo        <- fread(experimentFile)
    experimentInfo$run_id <- experimentInfo$experiment_date <- NULL

    merged <- merge(drugSensitivity, unique(experimentInfo))
    wide   <- dcast(merged, master_ccl_id ~ master_cpd_id, fun.aggregate=mean,
                    value.var="area_under_curve")
    attr(wide, "drugActivityMetric") <- "z-score(-log10(GI50))"
    attr(wide, "isDrugActivityDirectlyProportionalToSensitivity") <- TRUE
    colnames(wide)[1] <- "cellLine"

    # Correctly set compound names
    message(sprintf("Loading CTRP compound metadata from %s...", compoundFile))
    compoundInfo <- loadCTRPcompoundInfo(compoundFile)
    attr(wide, "compoundInfo") <- compoundInfo
    return(wide)
}

#' @rdname loadCTRPgeneExpression
#' @keywords internal
loadCTRPcompoundInfo <- function(compoundFile="CTRP 2.1/compoundInfo.txt") {
    compoundInfo <- fread(compoundFile)
    cols <- c("master_cpd_id"="id",
              "cpd_name"="name",
              "broad_cpd_id"="broad id",
              "cpd_smiles"="SMILES",
              "cpd_status"="FDA status",
              "target_or_activity_of_compound"="MOA",
              "gene_symbol_of_protein_target"="target")
    colnames(compoundInfo)[match(names(cols), colnames(compoundInfo))] <- cols
    colnames(compoundInfo) <- gsub("_", " ", colnames(compoundInfo))
    return(compoundInfo)
}

# Data loading from NCI60 ------------------------------------------------------

#' @rdname loadCTRPgeneExpression
#' @importFrom readxl read_excel
#' @importFrom data.table data.table transpose
#' @importFrom R.utils renameFile
loadNCI60geneExpression <- function(file="NCI60/geneExpr.xls",
                                    cellLineInfoFile="cellLineInfo.xls") {
    if (!file.exists(file)) {
        url <- paste0(
            "https://discover.nci.nih.gov/cellminerdata/normalizedArchives/",
            "nci60_RNA__RNA_seq_composite_expression.zip")
        folder <- dirname(file)
        downloadIfNotFound(url, file.path(folder, basename(url)))
        renameFile(file.path(folder, "RNA__RNA_seq_composite_expression.xls"),
                   file)
        renameFile(file.path(folder, "NCI60_CELL_LINE_METADATA.xls"),
                   cellLineInfoFile)
    }
    message(sprintf("Loading NCI60 gene expression from %s...", file))
    geneExpr <- read_excel(file, skip=10, na=c("na", "-"))

    # Convert data to numeric
    mat <- data.matrix(geneExpr[7:66])
    df  <- data.table(mat)

    # Transpose data
    trans <- cbind(colnames(mat), transpose(df))
    colnames(trans) <- c("cellLine", geneExpr[[1]])

    # Cell line metadata
    message(sprintf("Loading NCI60 cell line metadata from %s...", file))
    cellLineInfo <- read_excel(cellLineInfoFile, skip=7, n_max=60)
    # Remove footnote letters
    colnames(cellLineInfo) <- gsub(" .(,.)?$", "", colnames(cellLineInfo))
    colnames(cellLineInfo)[1] <- "cellLine"
    cellLineInfo$cellLine <- gsub(" .$", "", cellLineInfo$cellLine)

    # Fix cell names to be the exact same between datasets
    cellLineInfo$cellLine <- gsub("_", "-", cellLineInfo$cellLine)
    fix <- c("BR:HS578T"="BR:HS 578T",
             "BR:T47D"="BR:T-47D",
             "CO:COLO205"="CO:COLO 205",
             "LE:HL-60"="LE:HL-60(TB)",
             "ME:LOXIMVI"="ME:LOX IMVI",
             "LC:A549"="LC:A549/ATCC",
             "OV:NCI-ADR-RES"="OV:NCI/ADR-RES",
             "RE:RXF-393"="RE:RXF 393")
    cellLines <- match(names(fix), cellLineInfo$cellLine)
    cellLineInfo$cellLine[cellLines] <- fix
    colnames(cellLineInfo)[1] <- "cellLine"
    attr(trans, "cellLineInfo") <- cellLineInfo
    return(trans)
}

#' @inherit loadNCI60geneExpression
#'
#' @importFrom readxl read_excel
#' @importFrom data.table data.table transpose
#' @importFrom R.utils renameFile
#'
#' @keywords internal
loadNCI60drugSensitivity <- function(file="NCI60/drugSensitivity.xls") {
    if (!file.exists(file)) {
        url <- paste0("https://discover.nci.nih.gov/cellminerdata/",
                      "normalizedArchives/DTP_NCI60_ZSCORE.zip")
        toExtract <- "DTP_NCI60_ZSCORE.xls"
        folder <- dirname(file)
        downloadIfNotFound(url, file.path(folder, basename(url)))
        renameFile(file.path(folder, toExtract), file)
    }
    message(sprintf("Loading NCI60 drug sensitivity from %s...", file))
    drugSensitivity <- read_excel(file, skip=8, na=c("na", "-"))

    # Convert data to numeric
    mat <- data.matrix(drugSensitivity[7:66])
    mat <- -mat # Additive inverse of AUC values
    df  <- data.table(mat)

    # Transpose data
    trans <- cbind(colnames(mat), transpose(df))
    colnames(trans) <- c("cellLine", drugSensitivity[[1]])
    attr(trans, "drugActivityMetric") <- "-AUC (area under dose-response curve)"
    attr(trans, "isDrugActivityDirectlyProportionalToSensitivity") <- TRUE

    # Prepare compound information
    loadNCI60compoundInfo <- function(drugSensitivity) {
        # NCI60_CIDs: internal table with matches between PubChem CIDs and SIDs
        compoundInfo <- drugSensitivity[1:6]
        compoundInfo$`PubChem CID` <- NCI60_CIDs$CID[match(
            compoundInfo$`PubChem SID`, NCI60_CIDs$SID)]
        colnames(compoundInfo) <- gsub(" [a-z]$", "", colnames(compoundInfo))
        compoundInfo <- compoundInfo[c(1:5, 7, 6)]

        cols <- c("NSC #"="id", "Drug name"="name", "Mechanism of action"="MOA")
        colnames(compoundInfo)[match(names(cols),
                                     colnames(compoundInfo))] <- cols
        return(compoundInfo)
    }
    compoundInfo <- loadNCI60compoundInfo(drugSensitivity)
    attr(trans, "compoundInfo") <- compoundInfo
    return(trans)
}

# Data loading from GDSC -------------------------------------------------------

#' @rdname loadCTRPgeneExpression
#' @importFrom readxl read_excel
#'
#' @param file Character: file path
loadGDSC7file <- function(file, filename, type, ...) {
    if (!file.exists(file)) {
        url <- paste0(
            "ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/",
            "release-7.0/", filename)
        downloadIfNotFound(url, file)
    }
    message(sprintf("Loading GDSC %s from %s...", type, file))
    data <- read_excel(file, ...)
    return(data)
}

#' @rdname loadCTRPgeneExpression
loadGDSC7cellLineInfo <- function(file="GDSC_7/cellLineInfo.xlsx") {
    cellLineInfo <- loadGDSC7file(file, "Cell_Lines_Details.xlsx",
                                  "cell line information", sheet=2)
    colnames(cellLineInfo)[2] <- "cellLine"
    return(cellLineInfo)
}

#' @rdname loadCTRPgeneExpression
loadGDSC7compoundInfo <- function(file="GDSC_7/compoundInfo.xlsx") {
    compoundInfo <- loadGDSC7file(file, "Screened_Compounds.xlsx",
                                 "compound information")
    cols <- c("DRUG_ID"="id", "DRUG_NAME"="name")
    colnames(compoundInfo)[match(names(cols), colnames(compoundInfo))] <- cols
    colnames(compoundInfo) <- gsub("_", " ", tolower(colnames(compoundInfo)))
    return(compoundInfo)
}

#' @rdname loadCTRPgeneExpression
#' @importFrom data.table fread transpose
loadGDSC7geneExpression <- function(file="GDSC_7/geneExpr.txt") {
    if (!file.exists(file)) {
        url <- paste0(
            "ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/",
            "release-7.0/sanger1018_brainarray_ensemblgene_rma.txt.gz")
        downloadIfNotFound(url, file)
    }
    message(sprintf("Loading GDSC gene expression from %s...", file))
    geneExpr <- fread(file, header=TRUE)

    gext  <- transpose(geneExpr[ , -1])
    gext2 <- cbind(colnames(geneExpr)[-1], gext)
    colnames(gext2) <- c("cellLine", geneExpr[[1]])

    # Convert from ENSEMBL to gene symbols
    colnames(gext2) <- convertGeneIdentifiers(colnames(gext2))
    return(gext2)
}

#' @rdname loadCTRPgeneExpression
#' @importFrom reshape2 dcast
loadGDSC7drugSensitivity <- function(file="GDSC_7/drugs.xlsx") {
    drugSensitivity <- loadGDSC7file(file, "v17.3_fitted_dose_response.xlsx",
                                     "drug sensitivity")
    # Convert to wide format: compound (rows) vs cell lines (columns)
    drugSensitivity$minus_LN_IC50 <- -drugSensitivity$LN_IC50
    wide <- dcast(drugSensitivity, COSMIC_ID ~ DRUG_ID, fun.aggregate=mean,
                  value.var="minus_LN_IC50")
    colnames(wide)[1] <- "cellLine"
    attr(wide, "drugActivityMetric") <- "-log(IC50)"
    attr(wide, "isDrugActivityDirectlyProportionalToSensitivity") <- TRUE
    return(wide)
}

# Correlate gene expression and drug sensitivity -------------------------------

#' @importFrom stats cor
correlateGEandDrugSensitivity <- function(geneExpr, drugSensitivity,
                                          method="spearman") {
    message("Correlating gene expression and drug sensitivity...")
    # Match cell lines between datasets
    cellLines       <- intersect(geneExpr$cellLine, drugSensitivity$cellLine)
    geneExpr        <- geneExpr[match(cellLines, geneExpr$cellLine), ]
    drugSensitivity <- drugSensitivity[match(cellLines,
                                             drugSensitivity$cellLine), ]
    time <- Sys.time()
    res <- cor(geneExpr[ , -1], drugSensitivity[ , -1], method=method,
               use="pairwise.complete.obs")
    diffTime <- format(round(Sys.time() - time, 2))
    msg <- "Gene expression and drug sensitivity correlation performed in %s\n"
    message(sprintf(msg, diffTime))

    attr(res, "cellLines") <- cellLines
    attr(res, "method")    <- method
    attr(res, "type")      <- "compounds"
    attr(res, "date")      <- Sys.time()
    attr(res, "runtime")   <- diffTime
    return(res)
}

#' Prepare gene expression and drug sensitivity correlation matrix
#'
#' @param dataset Character: dataset to use (\code{CTRP}, \code{GDSC} or
#'   \code{NCI60})
#' @param method Character: correlation method to use between gene expression
#'   and drug sensitivity
#'
#' @details If path directs to non-existing files, data will be downloaded.
#'
#' @return Correlation matrix between gene expression and drug sensitivity
#' @keywords internal
prepareExpressionDrugSensitivityAssociation <- function(
    dataset=c("GDSC 7", "CTRP 2.1", "NCI60"), method="spearman") {

    dataset <- match.arg(dataset)

    source <- dataset
    if (dataset == "GDSC 7") {
        geneExpr        <- loadGDSC7geneExpression()
        geneInfo        <- NULL
        cellLineInfo    <- loadGDSC7cellLineInfo()
        drugSensitivity <- loadGDSC7drugSensitivity()
        compoundInfo    <- loadGDSC7compoundInfo()
    } else if (dataset == "CTRP 2.1") {
        geneExpr        <- loadCTRPgeneExpression()
        geneInfo        <- attr(geneExpr, "geneInfo")
        cellLineInfo    <- attr(geneExpr, "cellLineInfo")
        drugSensitivity <- loadCTRPdrugSensitivity()
        compoundInfo    <- attr(drugSensitivity, "compoundInfo")
    } else if (dataset == "NCI60") {
        geneExpr        <- loadNCI60geneExpression()
        geneInfo        <- NULL
        cellLineInfo    <- attr(geneExpr, "cellLineInfo")
        drugSensitivity <- loadNCI60drugSensitivity()
        compoundInfo    <- attr(drugSensitivity, "compoundInfo")
    }
    cor <- correlateGEandDrugSensitivity(geneExpr, drugSensitivity, method)

    # Include metadata
    cellLines    <- intersect(geneExpr$cellLine, drugSensitivity$cellLine)
    cellLineInfo <- cellLineInfo[cellLineInfo$cellLine %in% cellLines, ]
    attr(cor, "cellLineInfo") <- cellLineInfo

    attr(cor, "geneInfo")     <- geneInfo
    attr(cor, "compoundInfo") <- compoundInfo
    attr(cor, "source")       <- source
    attr(cor, "drugActivityMetric") <- attr(drugSensitivity,
                                            "drugActivityMetric")
    attr(cor, "isDrugActivityDirectlyProportionalToSensitivity") <- attr(
        drugSensitivity, "isDrugActivityDirectlyProportionalToSensitivity")
    return(cor)
}

#' @keywords internal
#' @importFrom utils askYesNo
#' @importFrom rhdf5 h5createFile h5write h5createDataset
writeExpressionDrugSensitivityCorHDF5 <- function(
    data, filename="expressionDrugSensitivityCorNCI60.h5") {
    # Create file with given name
    if (file.exists(filename)) {
        continue <- askYesNo(paste(filename, "already exists. Overwrite file?"),
                             default=FALSE)
        if (continue) {
            message("Overwriting file...")
        } else {
            return(invisible(FALSE))
        }
    }
    time <- Sys.time()
    unlink(filename)
    h5createFile(filename)

    attrs <- attributes(data)
    classes <- sapply(sapply(attributes(data), class), "[[", 1)
    attr(data, "classes") <- unname(cbind(names(classes), classes))

    # Save data.frame as datasets (data.frame attributes are unsupported)
    attrsDF <- names(attrs)[sapply(attrs, is.data.frame)]
    attributes(data)[attrsDF] <- NULL
    for (k in attrsDF) h5write(attrs[[k]], filename, k)

    # Save dim names as datasets (large character attributes are unsupported)
    h5write(colnames(data), filename, "colnames")
    h5write(rownames(data), filename, "rownames")
    attr(data, "dimnames") <- NULL

    # Convert logical values to 1 or 0 (logical attributes are unsupported)
    attributes(data)[sapply(attributes(data), isTRUE)]  <- 1
    attributes(data)[sapply(attributes(data), isFALSE)] <- 0

    # Convert date to string (POSIXt attributes are unsupported)
    attr(data, "date") <- as.character(attr(data, "date"))

    # Split data in appropriate chunks (faster reading of smaller subsets)
    chunk <- c( nrow(data), ceiling(ncol(data) / 10) )
    h5createDataset(filename, "data", dims=dim(data), chunk=chunk)
    h5write(data, filename, "data", write.attributes=TRUE)
    message(paste("HDF5 file written in", format(round(Sys.time() - time, 2))))
    return(invisible(TRUE))
}

# Load gene expression/drug sensitivity associations ---------------------------

#' List available gene expression and drug sensitivity correlation matrices
#'
#' @param url Boolean: return download link?
#'
#' @family functions related with the prediction of targeting drugs
#' @return Character vector of available gene expression and drug sensitivity
#'   correlation matrices
#' @export
#'
#' @examples
#' listExpressionDrugSensitivityAssociation()
listExpressionDrugSensitivityAssociation <- function(url=FALSE) {
    options <- c("GDSC 7"="expressionDrugSensitivityCorGDSC7.qs",
                 "CTRP 2.1"="expressionDrugSensitivityCorCTRP2.1.qs",
                 "NCI60"="expressionDrugSensitivityCorNCI60.h5")
    link <- file.path("https://compbio.imm.medicina.ulisboa.pt/public/cTRAP",
                      options)
    names(link) <- names(options)

    res <- link
    if (!url) res <- names(res)
    return(res)
}

#' @keywords internal
#' @importFrom tibble tibble
#' @importFrom rhdf5 h5ls h5read h5readAttributes h5closeAll
readExpressionDrugSensitivityCorHDF5 <- function(
    filename="expressionDrugSensitivityCorNCI60.h5", rows=NULL, cols=NULL,
    loadValues=FALSE, verbose=FALSE) {

    # Check which datasets are attributes or not
    info   <- h5ls(filename)
    isAttr <- info[["name"]] != "data"

    # Match character rows/cols with respective index
    readDimNames <- function(filename, type, index=NULL) {
        attr <- h5read(filename, type)
        if (!is.null(index)) attr <- attr[index]
        return(as.character(attr))
    }
    matchIndex <- function(dim, type) {
        if (is.character(dim)) {
            dim <- sort(unique(match(dim, readDimNames(filename, type))))
        }
        return(dim)
    }
    if (is.null(rows)) rows <- rownames(filename)
    rows <- matchIndex(rows, "rownames")
    if (is.null(cols)) cols <- colnames(filename)
    cols <- matchIndex(cols, "colnames")

    if (verbose) {
        msg <- paste("Loading gene expresion and drug sensitivity correlation",
                     "matrix from %s (%s rows x %s cols)...")
        nrows <- length(rows)
        ncols <- length(cols)
        if (nrows == 0) nrows <- length(readDimNames(filename, "rownames"))
        if (ncols == 0) ncols <- length(readDimNames(filename, "colnames"))
        message(sprintf(msg, filename, nrows, ncols))
    }

    # Read data from file
    name <- "data"
    attrs <- h5readAttributes(filename, name)
    if (loadValues) {
        data <- h5read(filename, name, index=list(rows, cols),
                       read.attributes=TRUE)
    } else {
        data <- filename
        mostattributes(data) <- attrs
    }

    # Convert attributes to original classes
    classes <- attr(data, "classes")
    classes <- setNames(classes[ , 2], classes[ , 1])
    classes <- classes[names(classes) %in% names(attributes(data))]
    attr(data, "classes") <- NULL

    for (k in seq(classes)) {
        this <- classes[[k]]
        name <- names(classes[k])
        if ( this == "character" ) {
            attr(data, name) <- as.character(attr(data, name))
        } else if ( this == "POSIXct" ) {
            attr(data, name) <- as.POSIXct(attr(data, name))
        } else if ( this == "logical" ) {
            attr(data, name) <- as.logical(attr(data, name))
        }
    }

    # Convert COMPOUND (data.frames) to tibbles and add them as attributes
    attrsDF <- info[["name"]][info[["dclass"]] == "COMPOUND"]
    for (i in attrsDF) {
        attr(data, i) <- suppressWarnings(tibble(h5read(filename, i)))
    }

    # Add rownames and colnames as attributes
    attr(data, "compounds") <- readDimNames(filename, "colnames", cols)
    attr(data, "genes")     <- readDimNames(filename, "rownames", rows)
    if (loadValues) {
        colnames(data) <- attr(data, "compounds")
        rownames(data) <- attr(data, "genes")
    }
    h5closeAll()
    return(data)
}

#' Operations on \code{expressionDrugSensitivityAssociation} objects
#' @param x An \code{expressionDrugSensitivityAssociation} object
#' @return Subset, dimension or dimension names
#' @export
dimnames.expressionDrugSensitivityAssociation <- function(x) {
    if (is.character(x)) {
        res <- list(attr(x, "genes"), attr(x, "compounds"))
    } else {
        res <- NextMethod("dimnames")
    }
    return(res)
}

#' @rdname dimnames.expressionDrugSensitivityAssociation
#' @export
dim.expressionDrugSensitivityAssociation <- function(x) {
    if (is.character(x)) {
        res <- vapply(dimnames(x), length, numeric(1))
    } else {
        res <- NextMethod("dim")
    }
    return(res)
}

#' @rdname dimnames.expressionDrugSensitivityAssociation
#'
#' @param i,j Character or numeric indexes specifying elements to extract
#' @param drop Boolean: coerce result to the lowest possible dimension?
#' @param ... Extra arguments given to other methods
#'
#' @export
`[.expressionDrugSensitivityAssociation` <- function(x, i, j, drop=FALSE, ...) {
    if (is.character(x)) {
        out <- subsetData(x, i, j, "genes", "compounds", nargs(), ...)
    } else {
        out <- NextMethod("[", drop=drop)
    }
    return(out)
}

#' Load gene expression and drug sensitivity correlation matrix
#'
#' @param source Character: source of matrix to load; see
#'   \code{\link{listExpressionDrugSensitivityAssociation}}
#' @param file Character: filepath to gene expression and drug sensitivity
#'   association dataset (automatically downloaded if file does not exist)
#' @param path Character: folder where to find files (optional; \code{file} may
#'   contain the full filepath if preferred)
#' @param rows Character or integer: rows
#' @param cols Character or integer: columns
#' @param loadValues Boolean: load data values (if available)? If \code{FALSE},
#'   downstream functions will load and process directly from the file chunk by
#'   chunk, resulting in a lower memory footprint
#'
#' @family functions related with the prediction of targeting drugs
#' @return Correlation matrix between gene expression (rows) and drug
#'   sensitivity (columns)
#' @export
#'
#' @importFrom tools file_ext
#' @importFrom qs qread
#'
#' @examples
#' gdsc <- listExpressionDrugSensitivityAssociation()[[1]]
#' loadExpressionDrugSensitivityAssociation(gdsc)
loadExpressionDrugSensitivityAssociation <- function(
    source, file=NULL, path=NULL, rows=NULL, cols=NULL, loadValues=FALSE) {
    
    available <- listExpressionDrugSensitivityAssociation(url=TRUE)
    source    <- match.arg(source, names(available))
    link      <- available[source]

    if (is.null(file)) file <- gsub("\\?.*", "", basename(link))
    if (!is.null(path)) file <- file.path(path, file)
    downloadIfNotFound(link, file)
    message(sprintf("Loading data from %s...", file))
    if (file_ext(file) == "h5") {
        cor <- readExpressionDrugSensitivityCorHDF5(file, rows=rows, cols=cols,
                                                    loadValues=loadValues)
    } else {
        if (is.null(cols)) cols <- TRUE
        if (is.null(rows)) rows <- TRUE
        
        if (file_ext(file) == "rds") {
            res <- readRDS(file)
        } else if (file_ext(file) == "qs") {
            res <- qread(file)
        } 
        cor <- res[rows, cols, drop=FALSE]
        attrs <- attributes(res)
        attrs <- attrs[!names(attrs) %in% names(attributes(cor))]
        attributes(cor) <- c(attributes(cor), attrs)
    }
    attr(cor, "filename") <- normalizePath(file)
    class(cor) <- c("expressionDrugSensitivityAssociation", class(cor))
    return(cor)
}

# Predict targeting drugs ------------------------------------------------------

#' Predict targeting drugs
#'
#' Identify compounds that may target the phenotype associated with a
#' user-provided differential expression profile by comparing such against a
#' correlation matrix of gene expression and drug sensitivity.
#'
#' @inherit rankAgainstReference
#' @param expressionDrugSensitivityCor Matrix or character: correlation matrix
#'   of gene expression (rows) and drug sensitivity (columns) across cell lines
#'   or path to file containing such data; see
#'   \code{\link{loadExpressionDrugSensitivityAssociation}()}.
#' @param isDrugActivityDirectlyProportionalToSensitivity Boolean: are the
#'   values used for drug activity directly proportional to drug sensitivity?
#'   If \code{NULL}, the argument \code{expressionDrugSensitivityCor} must have
#'   a non-\code{NULL} value for attribute
#'   \code{isDrugActivityDirectlyProportionalToSensitivity}.
#'
#' @family functions related with the prediction of targeting drugs
#' @export
#'
#' @examples
#' # Example of a differential expression profile
#' data("diffExprStat")
#'
#' # Load expression and drug sensitivity association derived from GDSC data
#' gdsc <- loadExpressionDrugSensitivityAssociation("GDSC 7")
#'
#' # Predict targeting drugs on a differential expression profile
#' predictTargetingDrugs(diffExprStat, gdsc)
predictTargetingDrugs <- function(
    input, expressionDrugSensitivityCor,
    method=c("spearman", "pearson", "gsea"), geneSize=150,
    isDrugActivityDirectlyProportionalToSensitivity=NULL, threads=1, chunkGiB=1,
    verbose=FALSE) {

    cellLines <- length(attr(expressionDrugSensitivityCor, "cellLines"))

    # Check if drug metric is proportional to sensitivity
    if (is.null(isDrugActivityDirectlyProportionalToSensitivity)) {
        isDrugActivityDirectlyProportionalToSensitivity <- attr(
            expressionDrugSensitivityCor,
            "isDrugActivityDirectlyProportionalToSensitivity")
    }

    if (is.null(isDrugActivityDirectlyProportionalToSensitivity)) {
        stop("attribute 'isDrugActivityProportionalToSensitivity' for argument",
             " 'expressionDrugSensitivityCor' cannot be NULL")
    }

    rankedDrugs <- rankAgainstReference(
        input, expressionDrugSensitivityCor, method=method, geneSize=geneSize,
        cellLines=cellLines, cellLineMean=FALSE, threads=threads,
        chunkGiB=chunkGiB, verbose=verbose,
        rankByAscending=isDrugActivityDirectlyProportionalToSensitivity)
    colnames(rankedDrugs)[[1]] <- "compound"

    # Inherit input settings and other relevant information
    attr(rankedDrugs, "expressionDrugSensitivityCor") <- attributes(
        expressionDrugSensitivityCor)
    class(rankedDrugs) <- c("targetingDrugs", class(rankedDrugs))
    return(rankedDrugs)
}

#' @importFrom ggplot2 ggplot geom_point geom_rug geom_density_2d xlab ylab
#'   theme_bw theme
#'
#' @keywords internal
plotTargetingDrug <- function(x, drug, method=c("spearman", "pearson", "gsea"),
                              genes=c("both", "top", "bottom"),
                              data=NULL, title=NULL) {
    method <- match.arg(method)
    drug <- as.character(drug)

    if (is.null(data)) {
        info <- attr(x, "expressionDrugSensitivityCor")
        data <- loadExpressionDrugSensitivityAssociation(info$source,
                                                         info$filename)
    }
    cor   <- data[ , drug, drop=TRUE]
    input <- attr(x, "input")

    if (method %in% c("spearman", "pearson")) {
        if (is.character(cor)) {
            intersecting <- intersect(names(input), rownames(cor))
            val <- readExpressionDrugSensitivityCorHDF5(
                cor[intersecting, drug], loadValues=TRUE, verbose=FALSE)
            val <- val[intersecting, 1, drop=TRUE]
        } else {
            intersecting <- intersect(names(input), names(cor))
            val          <- cor[intersecting]
        }
        df <- data.frame(input=input[intersecting], cor=val)

        plot <- ggplot(df, aes(.data[["input"]], .data[["cor"]])) +
            geom_point(alpha=0.1, na.rm=TRUE) +
            geom_rug(alpha=0.1) +
            geom_density_2d(na.rm=TRUE) +
            xlab("Differentially expressed genes (input)") +
            ylab(drug) +
            theme_bw() +
            theme(legend.position="bottom")
        if (!is.null(title)) {
            plot <- plot +
                ggtitle(title) +
                theme(plot.title = element_text(hjust = 0.5))
        }
        attr(plot, "data") <- cbind("Genes"=rownames(df), df)
    } else if (method == "gsea") {
        geneset <- c(attr(x, "pathways"), # legacy
                     attr(x, "geneset"))
        if (is.null(title)) title <- drug
        plot <- plotGSEA(cor, geneset, genes, title=title)
    }
    return(plot)
}
nuno-agostinho/cTRAP documentation built on March 28, 2024, 3:59 p.m.