R/vis.R

Defines functions hpaVisTissue

Documented in hpaVisTissue

###########################
## Visualize tissue data ##
###########################

#' Visualize tissue data
#'
#' Visualize the expression of protein of interest in each target tissue by cell
#' types.
#'
#' @param data Input the list object generated by \code{hpa_download()} or
#'   \code{hpa_subset()}. Require the \code{normal_tissue} dataset. Use HPA
#'   histology data (built-in) by default.
#' @param targetGene Vector of strings of HGNC gene symbols. By default it is
#'   set to \code{c('TP53', 'EGFR', 'CD44', 'PTEN')}. You can also mix
#'   HGNC gene symbols and ensemnl ids (start with ENSG) and they will be
#'   converted to HGNC gene symbols.
#' @param targetTissue Vector of strings of normal tissues. Default to all.
#' @param targetCellType Vector of strings of normal cell types. Default to all.
#' @param color Vector of 4 colors used to depict different expression levels.
#' @param customTheme Logical argument. If \code{TRUE}, the function will return
#'   a barebone ggplot2 plot to be customized further.
#'
#' @return This function will return a ggplot2 plot object, which can be further
#'   modified if desirable. The tissue data is visualized as a heatmap: x axis
#'   contains inquired protein and y axis contains tissue/cells of interest.
#'
#' @family visualization functions
#'
#' @examples
#'   data("hpa_histology_data")
#'   geneList <- c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1', 'IDH2', 'CYCS')
#'   tissueList <- c('breast', 'cerebellum', 'skin 1')
#'
#'   ## A typical function call
#'   hpaVisTissue(data=hpa_histology_data,
#'                targetGene=geneList,
#'                targetTissue=tissueList)
#'
#' @import dplyr
#' @import ggplot2
#' @export

hpaVisTissue <- function(data = NULL,
                         targetGene = NULL,
                         targetTissue = NULL,
                         targetCellType = NULL,
                         color = c('#FCFDBF', '#FE9F6D', '#DE4968', '#8C2981'),
                         customTheme = FALSE) {

    # Check if parameters are provided or not
    data <- is_null_data(data = data)
    
    target_check <- is_null_target(gene = targetGene,
                                   tissue = targetTissue,
                                   celltype = targetCellType)
    
    targetGene <- target_check$targetGene
    # targetTissue <- target_check$targetTissue
    
    # Create plot dataset with target parameters
    plotData <- data$normal_tissue %>%
        filter(gene %in% targetGene) %>%
        {if (!is.null(targetTissue)) filter(., tissue %in% targetTissue) else .} %>%
        {if (!is.null(targetCellType)) filter(., cell_type %in% targetCellType) else .}
    
    plotData <- mutate(
        plotData,
        tissue_cell = paste0(tissue, ' / ', cell_type),
        level = factor(level,
                       levels = c('High', 'Medium', 'Low', 'Not detected'))
    )

    levelColors <- c(
        'Not detected' = color[1],
        'Low' = color[2],
        'Medium' = color[3],
        'High' = color[4]
    )

    plot <- ggplot(plotData, aes(x = gene, y = tissue_cell)) +
        geom_tile(aes(fill = level)) +
        scale_x_discrete(limits = targetGene) +
        scale_fill_manual(values = levelColors)

    if (!customTheme) {
        plot <- plot +
            ylab('Tissue / Cell') +
            xlab('Genes') +
            theme_minimal() +
            theme(panel.grid = element_blank()) +
            theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
            coord_equal()
    }

    return(plot)
}



##############################
## Visualize pathology data ##
##############################

#' Visualize pathology data
#'
#' Visualize the expression of genes of interest in each cancer.
#'
#' @param data Input the list object generated by \code{hpa_download()} or
#'   \code{hpa_subset()}. Require the \code{pathology} dataset. Use HPA
#'   histology data (built-in) by default.
#' @param targetGene Vector of strings of HGNC gene symbols. By default it is
#'   set to \code{c('TP53', 'EGFR', 'CD44', 'PTEN')}. You can also mix HGNC gene
#'   symbols and ensemnl ids (start with ENSG) and they will be converted to
#'   HGNC gene symbols.
#' @param targetCancer Vector of strings of normal tissues. The function will
#'   plot all available cancer by default.
#' @param facetBy Determine how multiple graphs would be faceted. Either
#'   \code{cancer} (default) or \code{gene}.
#' @param color Vector of 4 colors used to depict different expression levels.
#' @param customTheme Logical argument. If \code{TRUE}, the function will return
#'   a barebone ggplot2 plot to be customized further.
#'
#' @return This function will return a ggplot2 plot object, which can be further
#'   modified if desirable. The pathology data is visualized as multiple bar
#'   graphs, one for each type of cancer. For each bar graph, x axis contains
#'   the inquired protein and y axis contains the proportion of patients.
#'
#' @family visualization functions
#'
#' @examples
#'   data("hpa_histology_data")
#'   geneList <- c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1', 'IDH2', 'CYCS')
#'   cancerList <- c('breast cancer', 'glioma', 'melanoma')
#'
#'   ## A typical function call
#'   hpaVisPatho(data=hpa_histology_data,
#'                  targetGene=geneList)
#'
#' @import dplyr
#' @import ggplot2
#' @importFrom stats reshape
#' @export

hpaVisPatho <- function(data = NULL,
                        targetGene = NULL,
                        targetCancer = NULL,
                        facetBy = "cancer",
                        color = c('#FCFDBF', '#FE9F6D', '#DE4968', '#8C2981'),
                        customTheme = FALSE) {
    # Check if parameters are provided or not
    data <- is_null_data(data = data)
    target_check <- is_null_target(gene = targetGene,
                                   cancer = targetCancer)
    targetGene <- target_check$targetGene
    
    # Prepare data for plotting
    
    plotData <- data$pathology %>%
        filter(gene %in% targetGene)
    
    if (!is.null(targetCancer)) {
        plotData <- filter(plotData, cancer %in% targetCancer)
    }
    
    plotData <- plotData %>%
        select(gene, cancer, high, medium, low, not_detected) %>%
        rename(
            'High' = 'high',
            'Medium' = 'medium',
            'Low' = 'low',
            'Not detected' = 'not_detected'
        ) %>%
        ## The old way used tidyr::gather
        # gather(key = "level", value = "patient_count", -gene, -cancer)
        ## The new way uses stats::reshape
        as.data.frame() %>%
        reshape(
            direction = "long",
            varying = list(3:6),
            v.names = "patient_count",
            timevar = "level",
            times = c("High", "Medium", "Low", "Not detected")
        )
    
    #re-level
    plotData$level <- factor(plotData$level,
                             levels = c("High", "Medium", "Low", "Not detected"))
    
    levelColors <- c(
        'Not detected' = color[1],
        'Low' = color[2],
        'Medium' = color[3],
        'High' = color[4]
    )
    
    plotBy <- if(facetBy == "cancer") "gene" else "cancer"
    
    plot <-
        ggplot(plotData, aes(x = .data[[plotBy]], y = patient_count, fill = level)) +
        geom_bar(stat = 'identity', position = 'fill') +
        scale_x_discrete(limits = if(facetBy == "cancer") targetGene else targetCancer) +
        scale_fill_manual(values = levelColors) +
        facet_wrap( ~ .data[[facetBy]])
    
    if (!customTheme) {
        plot <- plot +
            ylab('Patient proportions') +
            xlab(if(facetBy == "cancer") "Genes" else "Cancers") +
            theme_minimal() +
            theme(panel.grid = element_blank()) +
            theme(axis.text.x = element_text(angle = 90, hjust = 1))
    }
    
    return(plot)
}


############################
## Visualize subcell data ##
############################

#' Visualize subcellular location data
#'
#' Visualize the the confirmed subcellular locations of genes of interest.
#'
#' @param data Input the list object generated by \code{hpa_download()} or
#'   \code{hpa_subset()}. Require the \code{subcellular_location} dataset. Use
#'   HPA histology data (built-in) by default.
#' @param targetGene Vector of strings of HGNC gene symbols. By default it is
#'   set to \code{c('TP53', 'EGFR', 'CD44', 'PTEN')}. You can also mix
#'   HGNC gene symbols and ensemnl ids (start with ENSG) and they will be
#'   converted to HGNC gene symbols.
#' @param reliability Vector of string indicate which reliability scores you want to plot. The
#'   default is everything \code{c("enhanced", "supported", "approved",
#'   "uncertain")}.
#' @param color Vector of 2 colors used to depict if the protein expresses in a
#'   location or not.
#' @param customTheme Logical argument. If \code{TRUE}, the function will return
#'   a barebone ggplot2 plot to be customized further.
#'
#' @return This function will return a ggplot2 plot object, which can be further
#'   modified if desirable. The subcellular location data is visualized as a
#'   tile graph, in which the x axis includes the inquired proteins and the y
#'   axis contain the subcellular locations.
#'
#' @family visualization functions
#'
#' @examples
#'   data("hpa_histology_data")
#'   geneList <- c('TP53', 'EGFR', 'CD44', 'PTEN', 'IDH1', 'IDH2', 'CYCS')
#'
#'   ## A typical function call
#'   hpaVisSubcell(data=hpa_histology_data,
#'                   targetGene=geneList)
#'
#' @import dplyr
#' @import ggplot2
#' @importFrom tibble as_tibble
#' @export

hpaVisSubcell <- function(data = NULL,
                          targetGene = NULL,
                          reliability = c("enhanced", "supported", "approved", "uncertain"),
                          color = c('#FCFDBF', '#8C2981'),
                          customTheme = FALSE) {
    # Check if parameters are provided or not
    data <- is_null_data(data = data)
    target_check <- is_null_target(gene = targetGene)
    targetGene <- target_check$targetGene
    
    # Prepare data for plotting
    plotData <- data$subcellular_location %>%
        filter(gene %in% targetGene) %>%
        mutate(sub_location = NA)
    
    # if ("enhanced" %in% reliability)
    #     plotData <- mutate(plotData,
    #                        sub_location =  paste(sub_location, enhanced, sep = ";"))
    # if ("supported" %in% reliability)
    #     plotData <- mutate(plotData,
    #                        sub_location =  paste(sub_location, supported, sep = ";"))
    # if ("approved" %in% reliability)
    #     plotData <- mutate(plotData,
    #                        sub_location =  paste(sub_location, approved, sep = ";"))
    # if ("uncertain" %in% reliability)
    #     plotData <- mutate(plotData,
    #                        sub_location =  paste(sub_location, uncertain, sep = ";"))
    
    for (i in reliability) {
        plotData <- mutate(plotData, 
                           sub_location = paste(sub_location, .data[[i]], sep = ";"))
    }
    
    # plotData <-  plotData %>%
    #     mutate(sub_location=strsplit(sub_location, ';')) %>%
    #     tidyr::unnest(sub_location) %>%
    #     select(sub_location, gene) %>%
    #     filter(sub_location != "NA") %>%
    #     table() %>%
    #     as_tibble() %>%
    #     mutate(n=factor(n, levels=c('0', '1')))
    
    ## Use apply(as_tibble) %>% bind_rows instead of unnest
    plotData <-  plotData %>%
        mutate(sub_location = strsplit(sub_location, ';')) %>%
        apply(MARGIN = 1, FUN = as_tibble) %>% bind_rows() %>%
        select(sub_location, gene) %>%
        filter(sub_location != "NA") %>%
        table() %>%
        as_tibble() %>%
        mutate(n = factor(n, levels = c('0', '1')))
    
    levelColors <- c('0' = color[1],
                     '1' = color[length(color)])
    
    plot <- ggplot(plotData, aes(x = gene, y = sub_location)) +
        geom_tile(aes(fill = n), colour = "grey50") +
        scale_x_discrete(limits = targetGene) +
        scale_fill_manual(
            values = levelColors,
            name = "Detected",
            breaks = c(0, 1),
            labels = c("No", "Yes")
        )
    
    if (!customTheme) {
        plot <- plot +
            ylab('Subcellular locations') +
            xlab('Genes') +
            theme_minimal() +
            theme(panel.grid = element_blank()) +
            theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
            coord_equal()
    }
    
    return(plot)
}

## hpaVis shared child functions================================================
is_null_target <- function(gene = "",
                           tissue = "",
                           celltype = "",
                           cancer = "") {
    out <- list()
    infoDisp <- FALSE
    
    if (is.null(gene)) {
        message('* WARNING: targetGene variable not specified, use example (TP53, EGFR, CD44, PTEN).')
        out$targetGene <- c('TP53', 'EGFR', 'CD44', 'PTEN')
        infoDisp <- TRUE
    } else out$targetGene <- gene_ensembl_convert(gene, "gene")
    
    # Check if targetTissue is provided
    if (is.null(tissue)) {
        message('* WARNING: targetTissue variable not specified, visualize all.')
        infoDisp <- TRUE
    }
    
    # Check if targetCellType is provided
    if (is.null(celltype)) {
        message('* WARNING: targetCellType variable not specified, visualize all.')
        infoDisp <- TRUE
    }
    
    # Check if targetCellType is provided
    if (is.null(cancer)) {
        message('* WARNING: targetCancer variable not specified, visualize all.')
        infoDisp <- TRUE
    }
    
    # Show a message if any parameter is not defined
    if (infoDisp) {
        message('>> Use hpaListParam() to list possible values for target variables.')
    }
    
    return(out)
}
anhtr/HPAanalyze documentation built on June 3, 2023, 2:19 a.m.