R/utilities.R

Defines functions list_rev list2df addGESSannot addMOA add_pcid set_readable read_gmt load_OrgDb tarReduce vec_char_redu trts_check deprof2subexpr getSPsubSig getPCut getDEGSig getSig select_ont gs2mat gmt2h5 meanExpr2h5 gctx2h5 append2H5 create_empty_h5 matrix2h5 sep_pcf

Documented in addGESSannot addMOA add_pcid append2H5 create_empty_h5 gctx2h5 getDEGSig getSig getSPsubSig gmt2h5 list2df list_rev matrix2h5 meanExpr2h5 read_gmt set_readable tarReduce vec_char_redu

sep_pcf <- function(res){
    new <- as.data.frame(t(vapply(seq_len(nrow(res)), function(i)
        unlist(strsplit(as.character(res$set[i]), "__")), 
        FUN.VALUE=character(3))), stringsAsFactors=FALSE)
    colnames(new) <- c("pert", "cell", "type")
    res <- cbind(new, res[,-1])
    return(res)
}

# readHDF5mat <- function(h5file, colindex=seq_len(10)) {
#     m <- h5read(h5file, "assay", index=list(NULL, colindex))
#     mycol <- h5read(h5file, "colnames", index=list(colindex, 1))
#     myrow <- h5read(h5file, "rownames")
#     h5closeAll()
#     rownames(m) <- as.character(myrow[,1])
#     colnames(m) <- as.character(mycol[,1])
#     return(m)
# }

# getH5dim <- function(h5file){
#     mat_dim <- h5ls(h5file)$dim[1]
#     mat_ncol <- as.numeric(gsub(".* x ","", mat_dim))
#     mat_nrow <- as.numeric(gsub(" x .*","", mat_dim))
#     return(c(mat_nrow, mat_ncol))
# }

#' Write Matrix to HDF5 file
#' 
#' Function writes matrix object to an HDF5 file.
#' @param matrix matrix to be written to HDF5 file, row and column name slots 
#' need to be populated
#' @param h5file character(1), path to the hdf5 destination file
#' @param name The name of the dataset in the HDF5 file. The default is write
#' the score matrix (e.g. z-score, logFC) to the 'assay' dataset, users could 
#' also write the adjusted p-value or FDR matrix to the 'padj' dataset by 
#' setting the \code{name} as 'padj'.
#' @param overwrite TRUE or FALSE, whether to overwrite or append
#' matrix to an existing 'h5file'
#' @return HDF5 file containing exported matrix
#' @examples 
#' mat <- matrix(rnorm(12), nrow=3, dimnames=list(
#'               paste0("r",1:3), paste0("c",1:4)))
#' h5file <- tempfile(fileext=".h5")
#' matrix2h5(matrix=mat, h5file=h5file, overwrite=TRUE)
#' @export
matrix2h5 <- function(matrix, h5file, name="assay", overwrite=TRUE){
    create_empty_h5(h5file, delete_existing=overwrite)
    append2H5(matrix, h5file, name=name)
}

#' Create Empty HDF5 File 
#' 
#' This function can be used to create an empty HDF5 file where the user defines
#' the file path and compression level. The empty HDF5 file has under its root
#' group three data slots named 'assay', 'colnames' and 'rownames' for storing a
#' \code{numeric matrix} along with its column names (\code{character}) and row 
#' names (\code{character}), respectively.
#' 
#' @param h5file character(1), path to the HDF5 file to be created
#' @param delete_existing logical, whether to delete an existing HDF5 file with 
#' identical path
#' @param level The compression level used, here given as integer value between 
#' 0 (no compression) and 9 (highest and slowest compression).
#' @return empty HDF5 file
#' @examples
#' tmp_file <- tempfile(fileext=".h5")
#' create_empty_h5(tmp_file, level=6)
#' @export
create_empty_h5 <- function(h5file, delete_existing=FALSE, level=6) {
    if(delete_existing) unlink(h5file)
    if(! file.exists(h5file)){
        h5createFile(file=h5file)
        h5createDataset(h5file, "assay", c(0,0), c(H5Sunlimited(), H5Sunlimited()), 
                        chunk=c(12328,1), level=level)
        h5createDataset(h5file, "padj", c(0,0), c(H5Sunlimited(), H5Sunlimited()), 
                        chunk=c(12328,1), level=level)
        h5createDataset(h5file, "colnames", c(0,1), c(H5Sunlimited(), 1), 
                        storage.mode='character', size=200, level=level)
        h5createDataset(h5file, "rownames", c(0,1), c(H5Sunlimited(), 1), 
                        storage.mode='character', size=40, level=level)
    }
    h5closeAll()
}

#' Append Matrix to HDF5 File
#' 
#' Function to write matrix data to an existing HDF5 file. If the file contains 
#' already matrix data then both need to have the same number of rows. The 
#' append will be column-wise.
#' @param x matrix object to write to an HDF5 file. If the HDF5 file is not 
#' empty, the exported matrix data needs to have the same number rows as the 
#' matrix stored in the HDF5 file, and will be appended column-wise to the 
#' existing one.
#' @param h5file character(1), path to existing HDF5 file that can be empty or
#' contain matrix data
#' @param name The name of the dataset in the HDF5 file.
#' @param printstatus logical, whether to print status
#' @return HDF5 file storing exported matrix
#' @examples 
#' mat <- matrix(1:12, nrow=3)
#' rownames(mat) <- paste0("r", 1:3); colnames(mat) <- paste0("c", 1:4)
#' tmp_file <- tempfile(fileext=".h5")
#' create_empty_h5(tmp_file)
#' append2H5(mat, tmp_file)
#' rhdf5::h5ls(tmp_file)
#' @export
append2H5 <- function(x, h5file, name="assay", printstatus=TRUE) {
    status <- h5ls(h5file)[c("name", "dim")]
    rowstatus <- as.numeric(gsub(" x \\d{1,}$", "", 
                                 status[status$name==name, "dim"]))
    colstatus <- as.numeric(gsub("^\\d{1,} x ", "", 
                                 status[status$name==name, "dim"]))
    nrows <- nrow(x) 
    ncols <- colstatus + ncol(x)
    h5set_extent(h5file, name, c(nrows, ncols))
    h5write(x, h5file, name, index=list(seq_len(nrows), (colstatus+1):ncols))
    # only stores rownames and colnames of 'assay' dataset
    if(name=="assay"){
        h5set_extent(h5file, "colnames", c(ncols,1))
        h5write(colnames(x), h5file, "colnames", index=list((colstatus+1):ncols, 1))
        if(any(duplicated(h5read(h5file, "colnames")[,1]))) 
            warning("Column names contain duplicates!")
        
        h5set_extent(h5file, "rownames", c(nrows,1))
        h5write(rownames(x), h5file, "rownames", index=list(seq_len(nrows), 1))
        if(any(duplicated(h5read(h5file, "rownames")[,1]))) 
            warning("Row names contain duplicates!")
    }
    
    if(printstatus) h5ls(h5file, all=TRUE)[c("dim", "maxdim")]
    h5closeAll()
}

#' Read matrix-like data from large gctx file in chunks and write result back 
#' to an HDF5 file.
#' @title Convert GCTX to HDF5 File
#' @param gctx character(1), path to gctx file from LINCS
#' @param cid character or integer vector referencing the
#' columns of the matrix to include
#' @param new_cid character vector of the same length as cid, assigning new
#' column names to matrix
#' @param h5file character(1), path of the hdf5 destination file
#' @param by_ncol number of columns to import in each iteration to limit 
#' memory usage
#' @param overwrite TRUE or FALSE, whether to overwrite or to append to 
#' existing 'h5file'
#' @return HDF5 file
#' @import rhdf5
#' @examples 
#' gctx <- system.file("extdata", "test_sample_n2x12328.gctx", 
#'         package="signatureSearch")
#' h5file <- tempfile(fileext=".h5")
#' gctx2h5(gctx, cid=1:2, 
#'         new_cid=c('sirolimus__MCF7__trt_cp', 'vorinostat__SKB__trt_cp'), 
#'         h5file=h5file, overwrite=TRUE)
#' @export
#' 
gctx2h5 <- function(gctx, cid, new_cid=cid, h5file, by_ncol=5000, 
                    overwrite=TRUE){
    cid_list <- suppressWarnings(
        split(cid, rep(seq_len(ceiling(length(cid)/by_ncol)), 
                       each=by_ncol)))
    new_cid_list <- suppressWarnings(
        split(new_cid, rep(seq_len(ceiling(length(new_cid)/by_ncol)), 
                           each=by_ncol)))
    create_empty_h5(h5file, delete_existing=overwrite)
    lapply(seq_along(cid_list), function(i){
        mat <- parse_gctx(gctx, cid=cid_list[[i]], matrix_only=TRUE)
        mat <- mat@mat
        colnames(mat) <- new_cid_list[[i]]
        append2H5(x=mat, h5file, printstatus=FALSE)
    })    
    h5ls(h5file)
}

#' Calculate Mean Expression Values of LINCS Level 3 Data
#' 
#' Function calculates mean expression values for replicated samples of LINCS
#' Level 3 data that have been treated by the same compound in the same cell type
#' at a chosen concentration and treatment time. Usually, the function is used
#' after filtering the Level 3 data with \code{inst_filter}. The results (here
#' matrix with mean expression values) are saved to an HDF5 file. The latter is
#' referred to as the `lincs_expr` database.
#' @param gctx character(1), path to the LINCS Level 3 gctx file
#' @param inst tibble, LINCS Level 3 instances after filtering for specific concentrations and times
#' @param h5file character(1), path to the destination HDF5 file
#' @param chunksize number of columns of the matrix to be processed at a time to limit memory usage
#' @param overwrite TRUE or FALSE, whether to overwrite or append data to an
#' existing 'h5file'
#' @return HDF5 file, representing the \code{lincs_expr} database
#' @examples
#' gctx <- system.file("extdata", "test_sample_n2x12328.gctx", package="signatureSearch")
#' h5file <- tempfile(fileext=".h5")
#' inst <- data.frame(inst_id=c("ASG001_MCF7_24H:BRD-A79768653-001-01-3:10",
#'                              "CPC012_SKB_24H:BRD-K81418486:10"), 
#'     pert_cell_factor=c('sirolimus__MCF7__trt_cp', 'vorinostat__SKB__trt_cp'))
#' meanExpr2h5(gctx, inst, h5file, overwrite=TRUE)
#' @export
#' 
meanExpr2h5 <- function(gctx, inst, h5file, chunksize=2000, overwrite=TRUE){
    ## Get mean expression values of replicate samples from the same drug treatment in cells
    pert_cell_list <- split(inst$inst_id, inst$pert_cell_factor)
    chunk_list <- suppressWarnings(
        split(pert_cell_list, rep(seq_len(ceiling(length(pert_cell_list)/chunksize)), each=chunksize)))
    ## Creat an empty h5file
    if(file.exists(h5file)){
        if(overwrite){
            create_empty_h5(h5file, delete_existing=TRUE)
        }
    } else {
        create_empty_h5(h5file, delete_existing=FALSE)
    }
    ## append mean expr mat in each chunk to h5file
    lapply(chunk_list, function(pcl){
        cid_all <- unlist(pcl)
        mat_all <- parse_gctx(gctx, cid=cid_all, matrix_only=TRUE)
        mat_all <- mat_all@mat
        expr <- vapply(pcl, function(cid, mat_all){
            rowMeans(as.matrix(mat_all[,cid]))
        }, mat_all=mat_all, FUN.VALUE=numeric(12328))
        append2H5(x=expr, h5file, printstatus=FALSE)
    })
    h5ls(h5file)
}

#' Read gene sets from large gmt file in batches, convert the gene sets to
#' 01 matrix and write the result to an HDF5 file.
#' @title Convert GMT to HDF5 File
#' @param gmtfile character(1), path to gmt file containing gene sets
#' @param dest_h5 character(1), path of the hdf5 destination file
#' @param by_nset number of gene sets to import in each iteration to limit 
#' memory usage
#' @param overwrite TRUE or FALSE, whether to overwrite or to append to 
#' existing 'h5file'
#' @return HDF5 file
#' @examples 
#' gmt <- system.file("extdata", "test_gene_sets_n4.gmt", 
#'         package="signatureSearch")
#' h5file <- tempfile(fileext=".h5")
#' gmt2h5(gmtfile=gmt, dest_h5=h5file, overwrite=TRUE)
#' @export
gmt2h5 <- function(gmtfile, dest_h5, by_nset=5000, overwrite=FALSE){
    # get number of lines of gmtfile
    wc <- system(paste("wc -l", gmtfile), intern = TRUE)
    nline <- as.numeric(gsub(pattern=gmtfile, "", wc,fixed = TRUE))
    ceil <- ceiling(nline/by_nset)
    
    if(file.exists(dest_h5) & !overwrite){
        message(paste(dest_h5, "already exists!"))
    } else {
        create_empty_h5(dest_h5, delete_existing=TRUE)
    }
    
    # get all gene identifiers in gmtfile
    all_genes <- NULL
    for(i in seq_len(ceil)){
        gene_sets <- suppressWarnings(read_gmt(gmtfile, 
                            start=by_nset*(i-1)+1, end=by_nset*i))
        tmp <- unique(unlist(gene_sets))
        all_genes <- unique(c(all_genes, tmp))
    }
            
    # read in gene sets in batches
    for(i in seq_len(ceil)){
        gene_sets <- suppressWarnings(read_gmt(gmtfile, 
                        start=by_nset*(i-1)+1, end=by_nset*i))
        # transform gene sets to sparseMatrix
        mat <- gs2mat(gene_sets)
        miss_genes <- setdiff(all_genes, rownames(mat))
        patch <- matrix(0, nrow=length(miss_genes), ncol=ncol(mat))
        rownames(patch) <- miss_genes
        colnames(patch) <- colnames(mat)
        mat <- rbind(mat, patch)
        append2H5(x=mat[all_genes,], dest_h5, printstatus=FALSE)
    }
    h5ls(dest_h5)
}

gs2mat <- function(gene_sets){
    gsc <- GeneSetCollection(mapply(function(geneIds, setId) {
        GeneSet(geneIds, geneIdType=EntrezIdentifier(),
                setName=setId)
    }, gene_sets, names(gene_sets)))
    mat <- incidence(gsc)
    mat <- Matrix::t(mat)
    return(mat)
}

# readHDF5chunk <- function(h5file, colindex=seq_len(10), colnames=NULL) {
#     if(! is.null(colnames)){
#         all_trts <- h5read(h5file, "colnames", drop=TRUE)
#         colindex2 <- which(all_trts %in% colnames)
#         m <- h5read(h5file, "assay", index=list(NULL, colindex2))
#         colindex <- colindex2
#     } else {
#         m <- h5read(h5file, "assay", index=list(NULL, colindex))
#     }
#     mycol <- h5read(h5file, "colnames", index=list(colindex, 1))
#     myrow <- h5read(h5file, "rownames")
#     rownames(m) <- as.character(myrow[,1])
#     colnames(m) <- as.character(mycol[,1])
#     if(! is.null(colnames)){
#         m <- m[,colnames, drop=FALSE]
#     }
#     se <- SummarizedExperiment(assays=list(score=m))
#     h5closeAll()
#     return(se)
# }

select_ont <- function(res, ont, GO_DATA){
    # Add and select ontology in res
    res <- add_GO_Ontology(res, GO_DATA)
    tmp_df <- result(res)
    colnames(tmp_df)[1] = "ont"
    tmp_df$ont = as.character(tmp_df$ont)
    result(res) <- tmp_df
    if(ont != "ALL")
        result(res) <- as_tibble(res[res$ont == ont, ])
    return(res)
}

#' Functionalities used to draw from reference database 
#' (e.g. lincs, lincs_expr) GESs of compound treatment(s) in cell types.
#' 
#' The GES could be genome-wide differential expression profiles (e.g. log2 
#' fold changes or z-scores) or normalized gene expression intensity values 
#' depending on the data type of \code{refdb} or n top up/down regulated DEGs
#' @title Draw GESs from Reference Database 
#' @rdname getSig
#' @param cmp character vector representing a list of compound name available 
#' in \code{refdb} for \code{getSig} function, or character(1) indicating a
#' compound name (e.g. vorinostat) for other functions
#' @param cell character(1) or character vector of the same length as cmp 
#' argument. It indicates cell type that the compound treated in
#' @param refdb character(1), one of "lincs", "lincs_expr", "cmap", "cmap_expr",
#' or path to the HDF5 file built from \code{\link{build_custom_db}} function
#' @return matrix representing genome-wide GES of the query compound(s) in cell
#' @examples 
#' refdb <- system.file("extdata", "sample_db.h5", package = "signatureSearch")
#' vor_sig <- getSig("vorinostat", "SKB", refdb=refdb)
#' @export
#' 
getSig <- function(cmp, cell, refdb){
    if(is.character(refdb)){
        refdb <- determine_refdb(refdb)
        refse <- SummarizedExperiment(HDF5Array(refdb, name="assay"))
        rownames(refse) <- HDF5Array(refdb, name="rownames")
        colnames(refse) <- HDF5Array(refdb, name="colnames")
        trt <- paste(cmp, cell, "trt_cp", sep="__")
        trt2 <- intersect(trt, colnames(refse))
        notin <- setdiff(trt, colnames(refse))
        if(length(notin) > 0){
            warning(length(notin), "/", length(trt), 
                    " teatments are not contained in refdb, ", 
                    "they are ignored!")
        }
        cmp_mat <- as.matrix(assay(refse[,trt2]))
        # sort decreasingly
        #cmp_mat2 <- apply(cmp_mat, 2, sort, decreasing=TRUE)
        return(cmp_mat)
    } else {
        message(paste("Please set refdb as one of",  
                "'lincs', 'lincs_expr', 'cmap' or 'cmap_expr', "),
                "or path to an HDF5 file representing reference database!")
    }
}

#' @rdname getSig
#' @param Nup integer(1). Number of most up-regulated genes to be subsetted
#' @param Ndown integer(1). Number of most down-regulated genes to be subsetted
#' @param higher numeric(1), the upper threshold of defining DEGs. 
#' At least one of 'lower' and 'higher' must be specified.
#' If \code{Nup} or \code{Ndown} arguments are defined, it will be ignored. 
#' @param lower numeric(1), the lower threshold of defining DEGs. 
#' At least one of 'lower' and 'higher' must be specified.
#' If \code{Nup} or \code{Ndown} arguments are defined, it will be ignored. 
#' @param padj numeric(1), cutoff of adjusted p-value or false discovery rate (FDR)
#' of defining DEGs if the reference HDF5 database contains the p-value matrix 
#' stored in the dataset named as 'padj'.
#' If \code{Nup} or \code{Ndown} arguments are defined, it will be ignored. 
#' @return a list of up- and down-regulated gene label sets 
#' @examples 
#' vor_degsig <- getDEGSig(cmp="vorinostat", cell="SKB", Nup=150, Ndown=150,
#'                         refdb=refdb)
#' @export
getDEGSig <- function(cmp, cell, Nup=NULL, Ndown=NULL, 
                      higher=NULL, lower=NULL, padj=NULL, refdb="lincs"){
    deprof <- suppressMessages(getSig(cmp, cell, refdb))
    deprof_sort <- apply(deprof, 2, sort, decreasing=TRUE)
    if(!is.null(Nup) & is.null(Ndown)){
        Ndown = 0
    }
    if(is.null(Nup) & !is.null(Ndown)){
        Nup = 0
    }
    if(! is.null(Nup) & ! is.null(Ndown)){
        upset <- head(rownames(deprof_sort), Nup)
        downset <- tail(rownames(deprof_sort), Ndown)
        return(list(upset=upset, downset=downset))
    } 
    if(!is.null(higher) & is.null(lower)){
        up <- rownames(deprof_sort[deprof_sort >= higher,,drop=FALSE])
        if(!is.null(padj)){
            pdegs <- getPCut(cmp, cell, refdb, padj)
            up <- up[up %in% pdegs]
        }
        return(list(upset=up, downset=character()))
    }
    if(is.null(higher) & !is.null(lower)){
        down <- rownames(deprof_sort[deprof_sort <= lower,,drop=FALSE])
        if(!is.null(padj)){
            pdegs <- getPCut(cmp, cell, refdb, padj)
            down <- down[down %in% pdegs]
        }
        return(list(upset=character(), downset=down))
    }
    if(!is.null(higher) & !is.null(lower)){
        up <- rownames(deprof_sort[deprof_sort >= higher,,drop=FALSE])
        down <- rownames(deprof_sort[deprof_sort <= lower,,drop=FALSE])
        if(!is.null(padj)){
            pdegs <- getPCut(cmp, cell, refdb, padj)
            up <- up[up %in% pdegs]
            down <- down[down %in% pdegs]
        }
        return(list(upset=up, downset=down))
    }
    stop("You need to either set 'Nup', 'Ndown' as integers or 
    set 'higher', 'lower', 'padj' as numeric values")
}

getPCut <- function(cmp, cell, refdb, padj){
    db_path <- determine_refdb(refdb)
    pmat <- HDF5Array(db_path, name="padj")
    rownames(pmat) <- as.character(HDF5Array(db_path, name="rownames"))
    colnames(pmat) <- as.character(HDF5Array(db_path, name="colnames"))
    pval <- as.matrix(pmat[, paste(cmp, cell, "trt_cp", sep="__"), drop=FALSE])
    degs <- rownames(pval[pval <= padj,,drop=FALSE])
    return(degs)
}
#' @rdname getSig
#' @return a numeric matrix with one column representing gene expression values
#' drawn from \code{lincs_expr} db of the most up- and down-regulated genes. 
#' The genes were subsetted according to z-scores drawn from \code{lincs} db. 
#' @examples 
#' all_expr <- as.matrix(runif(1000, 0, 10), ncol=1)
#' rownames(all_expr) <- paste0('g', sprintf("%04d", 1:1000))
#' colnames(all_expr) <- "drug__cell__trt_cp"
#' de_prof <- as.matrix(rnorm(1000, 0, 3), ncol=1)
#' rownames(de_prof) <- paste0('g', sprintf("%04d", 1:1000))
#' colnames(de_prof) <- "drug__cell__trt_cp"
#' ## getSPsubSig internally uses deprof2subexpr function
#' ## sub_expr <- deprof2subexpr(all_expr, de_prof, Nup=150, Ndown=150)
#' @export
getSPsubSig <- function(cmp, cell, Nup=150, Ndown=150){
    query_expr <- suppressMessages(getSig(cmp, cell, refdb="lincs_expr"))
    query_prof <- suppressMessages(getSig(cmp, cell, refdb="lincs"))
    sub_expr <- deprof2subexpr(query_expr, query_prof, Nup=Nup, Ndown=Ndown)
    return(sub_expr)
}

deprof2subexpr <- function(all_expr, de_prof, Nup=150, Ndown=150){
    de_prof_sort <- apply(de_prof, 2, sort, decreasing=TRUE)
    upset <- head(rownames(de_prof_sort), Nup)
    downset <- tail(rownames(de_prof_sort), Ndown)
    sub_expr <- all_expr[c(upset, downset), , drop=FALSE]
    return(sub_expr)
}
    
trts_check <- function(ref_trts, full_trts){
    trts_valid <- intersect(ref_trts, full_trts)
    inval <- setdiff(ref_trts, full_trts)
    if(length(inval)>0){
        message(length(inval), 
        " treatments in ref_trts are not available in refdb, they are ignored!")
    }
    if(length(trts_valid)==0){
        stop("No ref_trts are available in refdb, ", 
             "the refdb is subsetted to empty!")
    }
    return(trts_valid)
}

#' Reduce number of characters for each element of a character vector by 
#' replacting the part that beyond Nchar (e.g. 50) character to '...'.  
#' @title Reduce Number of Character 
#' @param vec character vector to be reduced
#' @param Nchar integer, for each element in the vec, number of characters to 
#' remain
#' @return character vector after reducing
#' @examples 
#' vec <- c(strrep('a', 60), strrep('b', 30))
#' vec2 <- vec_char_redu(vec, Nchar=50)
#' @export
vec_char_redu <- function(vec, Nchar=50){
    vec <- as.character(vec)
    res <- vapply(vec, function(s){
        if(nchar(s) > Nchar){
            s2 <- substr(s, 1, 50)
            paste0(s2, "...")
        } else s
    }, FUN.VALUE=character(1))
    return(res)
}

#' Show Reduced Targets
#' 
#' Reduce number of targets for each element of a character vector by 
#' replacting the targets that beyond Ntar to '...'.  
#' @param vec character vector, each element composed by a list of targets 
#' symbols separated by '; '
#' @param Ntar integer, for each element in the vec, number of targets to show
#' @return character vector after reducing
#' @examples 
#' vec <- c("t1; t2; t3; t4; t5; t6", "t7; t8")
#' vec2 <- tarReduce(vec, Ntar=5)
#' @export
#' 
tarReduce <- function(vec, Ntar=5){
    singleTarShot <- function(s, Ntar){
        temp <- strsplit(s, ';\\s?')[[1]]
        if(length(temp)>Ntar){
            return(paste0(paste(temp[seq_len(Ntar)], collapse = "; "), "; ..."))
        } else {
            return(s)
        }
    }
    return(sapply(vec, singleTarShot, Ntar, USE.NAMES = FALSE))
}

load_OrgDb <- function(OrgDb){
    if(is(OrgDb, "character")){
        if(! require(OrgDb, character.only = TRUE)){
            stop(paste(OrgDb, "package need to be installed to use this function"))
        }
        OrgDb <- eval(parse(text = OrgDb))
    }
    return(OrgDb)
}

#' Read in gene set information from .gmt files
#' 
#' This function reads in and parses information from the MSigDB's .gmt files. 
#' Pathway information will be returned as a list of gene sets.
#' 
#' The .gmt format is a tab-delimited list of gene sets, where each line is a 
#' separate gene set. The first column must specify the name of the gene set, 
#' and the second column is used for a short description (which this function 
#' discards). For complete details on the .gmt format, refer to the Broad 
#' Institute's Data Format's page 
#' \url{http://www.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats}.
#' 
#' @param file The .gmt file to be read
#' @param start integer(1), read the gmt file from start line
#' @param end integer(1), read the gmt file to the end line, the default -1 
#' means read to the end 
#' @return A list, where each index represents a separate gene set.
#' @section Warning:
#' The function does not check that the file is correctly formatted, and may 
#' return incorrect or partial gene sets, e.g. if the first two columns are 
#' omitted. Please make sure that files are correctly formatted before reading 
#' them in using this function.
#' @examples
#' gmt_path <- system.file("extdata/test_gene_sets_n4.gmt", package="signatureSearch")
#' geneSets <- read_gmt(gmt_path)
#' @export
#' 
read_gmt <- function(file, start=1, end=-1){
    if (!grepl("\\.gmt$", file)[1]) {
        stop("Pathway information must be a .gmt file")
    }
    geneSetDB = scan(file, what="character", n=end-start+1, skip=start-1, 
                     sep="\n", quiet=TRUE)
    geneSetDB = suppressWarnings(strsplit(geneSetDB, "\t"))
    names(geneSetDB) = sapply(geneSetDB, "[", 1)
    geneSetDB = lapply(geneSetDB, "[", -1:-2)
    geneSetDB = lapply(geneSetDB, function(x) {
        x[which(x != "")]
    })
    geneSetDB <- geneSetDB[sapply(geneSetDB, length)>0 & !is.na(names(geneSetDB))]
    return(geneSetDB)
}

##' Mapping 'itemID' column in the FEA enrichment result table from Entrez ID
##' to gene Symbol
##'
##' @title Set Readable
##' @param tb tibble object, enrichment result table
##' @param OrgDb character(1), 'org.Hs.eg.db' for human
##' @param keyType character(1), keyType of gene
##' @param geneCol character(1), name of the column in 'tb' containing gene
##' Entrez ids separated by '/' to be converted to gene Symbol
##' @return tibble Object
##' @importFrom stats setNames
##' @importFrom AnnotationDbi columns
##' @examples 
##' data(drugs10)
##' res <- tsea_dup_hyperG(drugs=drugs10, type="Reactome", pvalueCutoff=1, 
##'                        qvalueCutoff=1)
##' res_tb <- set_readable(result(res))
##' @export
set_readable <- function(tb, OrgDb="org.Hs.eg.db", keyType="ENTREZID", geneCol="itemID"){
    OrgDb <- load_OrgDb(OrgDb)
    if (!'SYMBOL' %in% columns(OrgDb)) {
        warning("Fail to convert input geneID to SYMBOL since no SYMBOL information available in the provided OrgDb...")
    }
    
    if(! geneCol %in% colnames(tb)){
        stop("The column name defined under 'geneCol' must be contained in tb!")
    }
    gc <- setNames(strsplit(tb[[geneCol]], "/", fixed=TRUE), tb$ID)
    genes <- unique(unlist(gc))
    gn <- EXTID2NAME(OrgDb, genes, keyType)
    gc <- lapply(gc, function(i) gn[i])
    geneID <- sapply(gc, paste0, collapse="/")
    
    # gc_sym <- sapply(gc, function(genes){
    #     gn_df <- suppressMessages(select(OrgDb, keys=genes, keytype=keyType, 
    #                                      columns="SYMBOL"))
    #     gn_sym <- paste0(na.omit(gn_df$SYMBOL), collapse="/")
    #     return(gn_sym)})
    
    tb[[geneCol]] <- geneID
    return(tb)
}

#' Add PCID to drug data frame
#' 
#' This function can be used to add the \code{PCIDss} (PubChem CID column added
#' from signatureSearch package) column to a data frame that have a column 
#' store compound names. The compound name to PubChem CID annotation is obtained
#' from \code{lincs_pert_info} in 2017.
#' 
#' @param df data frame or tibble object
#' @param drug_col name of the column that store compound names in df
#' @return tibble object with an added PCIDss column
#' @importFrom dplyr relocate
#' @examples 
#' data("lincs_pert_info")
#' # gess_tb2 <- add_pcid(gess_tb)
#' @export
add_pcid <- function(df, drug_col="pert"){
    data("lincs_pert_info", envir=environment())
    pert2 <- lincs_pert_info[, c("pert_iname", "pubchem_cid")]
    colnames(pert2) <- c("pert_iname", "PCIDss")
    join_cols = "pert_iname"
    names(join_cols) <- drug_col
    df %<>% left_join(pert2, by=join_cols)
    return(df)
}

#' Add MOA annotation to drug data frame
#' 
#' The MOA annotation is a list of MOA name to drug name mappings. 
#' This functions add the MOA column to data frame when data frame have a 
#' column with compound names
#' 
#' @param df data frame that must contains a column with compound names
#' @param drug_col character (1), name of the column that stores compound names
#' @param moa_list a list object of MOA name (e.g. HDAC inhibitor) to compound
#' name mappings
#' @return data frame with an added MOAss column  
#' @examples 
#' data("clue_moa_list")
#' df <- data.frame(pert=c("vorinostat", "sirolimus"), annot1=c("a", "b"), 
#'                  annot2=1:2)
#' addMOA(df, "pert", clue_moa_list) 
#' @export
addMOA <- function(df, drug_col, moa_list){
    drug2moa <- list_rev(moa_list)
    d2m_vec <- sapply(drug2moa, paste, collapse="; ")
    df$MOAss <- d2m_vec[df[[drug_col]]]
    return(df)
}

#' Add Compound Annotation Info to GESS Result Table
#' 
#' This function supports adding customized compound annotation table to the 
#' GESS result table if provided. It then automatically adds the target gene
#' symbols, MOAs and PubChem CID columns (\code{t_gn_sym}, \code{MOAss}, 
#' \code{PCIDss}) if the table contains a column that stores compound names.
#'  
#' @param gess_tb tibble or data.frame object of GESS result, 
#' can be accessed by the \code{result} method on the \code{gessResult} object 
#' from \code{gess_*} functions. Or a customized data frame that contains a 
#' \code{pert} column that stores compound id or name.
#' @param refdb character(1), reference database that can be accessed by 
#' the \code{refdb} method on the \code{gessResult} object. If \code{gess_tb}
#' is a customized table, \code{refdb} can be just set as 'custom'.
#' @param cmp_annot_tb data.frame or tibble of compound annotation table. 
#' This table contains annotation information for compounds stored under
#' \code{pert} column of \code{gess_tb}. Set to \code{NULL} if not available.
#' This table should not contain columns with names of "t_gn_sym", "MOAss" or 
#' "PCIDss", these three columns will be added internally and thus conserved by 
#' the function. If they are contained in \code{cmp_annot_tb}, they will be
#' overwritten. If users want to maintain these three columns in the provided 
#' annotation table, give them different names.
#' @param by character(1), column name in \code{cmp_annot_tb} that can be merged
#' with \code{pert} column in \code{gess_tb}. If \code{refdb} is set 
#' as 'lincs2', it will be merged with \code{pert_id} column in the GESS result 
#' table. If \code{cmp_annot_tb} is NULL, \code{by} is ignored.
#' @param cmp_name_col character(1), column name in \code{gess_tb} or 
#' \code{cmp_annot_tb} that store compound names. If there is no compound name 
#' column, set to \code{NULL}. If \code{cmp_name_col} is available, 
#' three additional columns (t_gn_sym, MOAss, PCIDss) are automatically added 
#' by using \code{\link{get_targets}}, CLUE touchstone compound MOA annotation, 
#' and 2017 \code{lincs_pert_info} annotation table, respectively as 
#' annotation sources. t_gn_sym: target gene symbol, MOAss: MOA annotated from 
#' signatureSearch, PCIDss: PubChem CID annotated from signatureSearch.
#' @return tibble of \code{gess_tb} with target, MOA, PubChem CID annotations
#' and also merged with user provided compound annotation table. 
#' @importFrom dplyr tibble
#' @examples 
#' gess_tb <- data.frame(pert=c("vorinostat", "sirolimus", "estradiol"),
#'                   cell=c("SKB", "SKB", "MCF7"),
#'                   NCS=runif(3))
#' cmp_annot_tb <- data.frame(pert_name=c("vorinostat", "sirolimus", "estradiol"),
#'                            prop1=c("a", "b", "c"),
#'                            prop2=1:3)
#' addGESSannot(gess_tb, "custom", cmp_annot_tb, by="pert_name", 
#'              cmp_name_col="pert")
#' @export
addGESSannot <- function(gess_tb, refdb, cmp_annot_tb=NULL, by="pert", 
                         cmp_name_col="pert"){
    names(by) <- "pert"
    lastcol <- colnames(gess_tb)[ncol(gess_tb)]
    if(refdb == "lincs2"){
        # rename 'pert' as 'pert_id'; add 'pert' column
        data("lincs_pert_info2", envir=environment())
        gess_tb %<>% left_join(lincs_pert_info2[,c("pert_id", "pert_iname")],
                               by=c("pert"="pert_id")) %>% 
            relocate("pert_iname", .after="pert") %>% 
            dplyr::rename(c("pert_id"="pert", "pert"="pert_iname"))
        names(by) <- "pert_id"
    }
    if(!is.null(cmp_annot_tb)){
        if(! by %in% colnames(cmp_annot_tb)){
            warning("'by' argument needs to be a column name that is in 'cmp_annot_tb' so that 
  this column can be merged with 'pert' column in GESS results. If the refdb is 'lincs2', 
  it will be merged with the 'pert_id' column. Now, the compound annotation table will not 
  be added to GESS result table.")
        } else {
            # check whether t_gn_sym, MOAss, PCIDss columns in cmp_annot_tb
            conf_col <- intersect(c("t_gn_sym", "MOAss", "PCIDss"), colnames(cmp_annot_tb))
            if(length(conf_col) > 0){
                message(paste(conf_col, collapse=", "), 
                    " columns exists in cmp_annot_tb, they will be replaced",
                    " by the internally generated columns, if users want to",
                    " keep those columns, please rename them.")
            }
            cmp_annot_tb %<>% dplyr::select(-all_of(conf_col))
            res <- left_join(tibble(gess_tb), cmp_annot_tb, by=by)
        }
    } else {
        res <- gess_tb
    }
    
    if(is.null(cmp_name_col)){
        return(tibble(res)) 
    } else if(! cmp_name_col %in% colnames(res)){
                cmp_name_col <- "pert"
    }
    
    # Add t_gn_sym
    target <- suppressMessages(get_targets(res[[cmp_name_col]]))
    join_cols <- "drug_name"; names(join_cols) <- cmp_name_col
    res %<>% left_join(target, by=join_cols)
    # Add MOAss
    data("clue_moa_list", envir=environment())
    res <- addMOA(res, cmp_name_col, clue_moa_list)
    # Add PCIDss
    res <- add_pcid(res, cmp_name_col)
    res %<>% relocate(t_gn_sym:PCIDss, .after=all_of(lastcol))
    return(tibble(res))
}

#' Named list to data frame 
#' 
#' Convert a list with names that have one to many mapping relationships to
#' a data.frame of two columns, one column is names, the other column is the
#' unlist elements
#' 
#' @param list input list with names slot
#' @param colnames character vector of length 2, indicating the column names
#' of the returned data.frame
#' @return data.frame
#' @examples 
#' list <- list("n1"=c("e1", "e2", "e4"), "n2"=c("e3", "e5"))
#' list2df(list, colnames=c("name", "element"))
#' @export
list2df <- function(list, colnames){
    df_list <- lapply(names(list), function(x){
        return(data.frame(x=x, y=list[[x]]))
    })
    df <- do.call(rbind, df_list)
    colnames(df) <- colnames
    return(unique(df))
}

#' Reverse list 
#' 
#' Reverse list from list names to elements mapping to elements to names mapping.
#' 
#' @param list input list with names slot
#' @return list
#' @examples 
#' list <- list("n1"=c("e1", "e2", "e4"), "n2"=c("e1", "e5"))
#' list_rev(list)
#' @export
list_rev <- function(list){
    df <- list2df(list, colnames=c("name", "ele"))
    listr <- split(df$name, df$ele)
    listr2 <- sapply(listr, unique, simplify=FALSE)
    return(listr2)
}

## get rid of "Undefined global functions or variables" note
globalVariables(c("PCID", "pubchem_cid", "lincs_pert_info",
                  "PCIDss", "all_of", "lincs_pert_info2", "t_gn_sym")) 
yduan004/signatureSearch documentation built on Feb. 19, 2024, 9:30 a.m.