Nothing
#######################
## Download datasets ##
#######################
#' Download datasets
#'
#' Download the latest version of HPA datasets and import them in R. It is
#' recommended to only download the datasets you need, as some of them may be
#' very big.
#'
#' @param downloadList A vector or string indicate which datasets to download.
#' Possible value:
#' \itemize{
#' \item \code{'Normal tissue'}
#' \item \code{'Pathology'}
#' \item \code{'Subcellular location'}
#' \item \code{'RNA tissue'}
#' \item \code{'RNA cell line'}
#' \item \code{'RNA transcript tissue'}
#' \item \code{'RNA transcript cell line'}
#' \item \code{'all'}: download everything
#' \item \code{'histology'}: same as \code{c('Normal tissue', 'Pathology',
#' 'Subcellular location')}
#' \item \code{'rna'}: same as \code{c('RNA tissue', 'RNA cell line')}
#' \item \code{'isoform'}: same as \code{c('RNA transcript tissue', 'RNA
#' transcript cell line')}
#' }
#' See \url{https://www.proteinatlas.org/about/download} for more information.
#'
#' @param version A string indicate which version to be downloaded. Possible
#' value:
#' \itemize{
# \item \code{'hpar'}: Load data from the 'hpar' package. Does not contain
#isoform data.
#' \item \code{'latest'}: Download latest version. Require Internet
#' connection.
#' \item \code{'example'}: Load the example dataset from 'HPA
#' analyze' ('hpa_histology_data'). Does not contain rna or
#' isoform data.
#' \item \code{'v?'} with '?' is a integer: Download a specific version of
#' the dataset. For example: 'v18' download version 18. Currently support
#' version 17 and above. Require Internet connection.
#' }
#'
#' @family downloadable datasets functions
#'
#' @return This function will return a list of tibbles corresponding to
#' requested datasets.
#'
#' @seealso
#' \code{\link{hpaDownload}}
#' \code{\link{hpa_histology_data}}
#'
#' @examples
#' downloadedData <- hpaDownload(downloadList='all', version='example')
#' summary(downloadedData)
#'
#' @import dplyr
# @import hpar
#' @importFrom utils download.file data read.delim2
#' @importFrom stats reshape
#' @importFrom tibble as_tibble
#' @export
#'
hpaDownload <- function(downloadList='histology', version='latest') {
## generate a list of item to download
if(downloadList == 'all') {
downloadList <- c('Normal tissue',
'Pathology',
'Subcellular location',
'RNA tissue',
'RNA cell line',
'RNA transcript tissue',
'RNA transcript cell line')
} else if(downloadList == 'histology') {
downloadList <- c('Normal tissue',
'Pathology',
'Subcellular location')
} else if(downloadList == 'rna') {
downloadList <- c('RNA tissue',
'RNA cell line')
} else if(downloadList == 'isoform') {
downloadList <- c('RNA transcript tissue',
'RNA transcript cell line')
}
#initiate the list to be returned
loadedData <- list()
#create list of colnames
normalTissueColnames <- c('ensembl', 'gene', 'tissue',
'cell_type', 'level', 'reliability')
pathologyColnames <- c('ensembl', 'gene', 'cancer', 'high',
'medium', 'low', 'not_detected',
'prognostic_favorable',
'unprognostic_favorable',
'prognostic_unfavorable',
'unprognostic_unfavorable')
pathologyColnamesHpar <- c('ensembl', 'gene', 'cancer', 'high',
'low', 'medium', 'not_detected')
subcellularLocationColnames_legacy <- c('ensembl', 'gene', 'reliability',
'enhanced', 'supported', 'approved',
'uncertain', 'single_cell_var_intensity',
'single_cell_var_spatial',
'cell_cycle_dependency', 'go_id')
subcellularLocationColnames <- c('ensembl', 'gene', 'reliability',
"main_location", "additional_location",
"extracellular_location",
'enhanced', 'supported', 'approved',
'uncertain', 'single_cell_var_intensity',
'single_cell_var_spatial',
'cell_cycle_dependency', 'go_id')
rnaTissueColnames <- c('ensembl', 'gene', 'tissue', 'nx')
rnaCellLineColnames <- c('ensembl', 'gene', 'cell_line',
'tpm', 'ptpm', "nx")
transcriptRnaTissueColnames <- c('ensembl', 'transcript',
'tissue', 'tpm')
transcriptRnaCellLineColnames <- c('ensembl', 'transcript',
'cell_line', 'tpm')
# if (version == 'hpar') {# load 'hpar' data
# if(!('package:hpar' %in% search())) {
# attachNamespace('hpar')
# # stop('Please load "hpar" package')
# }
#
# if('Normal tissue' %in% downloadList) {# load 'normal_tissue'
# data('hpaNormalTissue',
# envir=environment())
# normal_tissue <- as_tibble(hpaNormalTissue)
# colnames(normal_tissue) <- normalTissueColnames
# loadedData$normal_tissue <- normal_tissue
# }
#
# if('Pathology' %in% downloadList) {# load `pathology`
# data('hpaCancer',
# envir=environment())
# pathology <- as_tibble(hpaCancer) %>%
# select(-Total.patients) %>%
# spread(Level, Count.patients)
# colnames(pathology) <- pathologyColnamesHpar
# loadedData$pathology <- pathology
# }
#
# if('Subcellular location' %in% downloadList) {# load 'subcellular_location'
# data('hpaSubcellularLoc',
# envir=environment())
# subcellular_location <- as_tibble(hpaSubcellularLoc)
# colnames(subcellular_location) <- subcellularLocationColnames
# subcellular_location$gene <- as.character(subcellular_location$gene)
# subcellular_location$go_id <- as.character(subcellular_location$go_id)
# loadedData$subcellular_location <- subcellular_location
# }
#
# if('RNA tissue' %in% downloadList) {# load 'rna_tissue'
# data('rnaGeneTissue',
# envir=environment())
# rna_tissue <- as_tibble(rnaGeneTissue)
# colnames(rna_tissue) <- rnaTissueColnames
# loadedData$rna_tissue <- rna_tissue
# }
#
# if('RNA cell line' %in% downloadList) { # load 'rna_cell_line'
# data('rnaGeneCellLine',
# envir=environment())
# rna_cell_line <- as_tibble(rnaGeneCellLine)
# colnames(rna_cell_line) <- rnaCellLineColnames
# loadedData$rna_cell_line <- rna_cell_line
# }
# } else if (version == 'example') {# load example data
if (version == 'example') {# load example data
data('hpa_histology_data',
package='HPAanalyze',
envir=environment())
if('Normal tissue' %in% downloadList) {# load 'normal_tissue'
loadedData$normal_tissue <- HPAanalyze::hpa_histology_data$normal_tissue
}
if('Pathology' %in% downloadList) {# load `pathology`
loadedData$pathology <- HPAanalyze::hpa_histology_data$pathology
}
if('Subcellular location' %in% downloadList) {# load 'subcellular_location'
loadedData$subcellular_location <- HPAanalyze::hpa_histology_data$subcellular_location
}
} else {# download data from the internet
# generate a vector of urls for download
downloadUrls <- version_to_download_urls(version)
# Check if the term is requested or not (all by default)
# then download the file
# then unzip and make a tibble out of data
# then add the tibble to the list
if('Normal tissue' %in% downloadList) {
temp <- tempfile()
download.file(url=downloadUrls['normal_tissue'],
destfile=temp)
# normal_tissue <- read_tsv(unz(temp, 'normal_tissue.tsv'))
normal_tissue <-
read.delim2(
unz(temp, 'normal_tissue.tsv'),
stringsAsFactors = FALSE,
check.names = FALSE,
strip.white = TRUE,
sep="\t",
na.strings = c("", " ")
) %>% as_tibble()
unlink(temp)
colnames(normal_tissue) <- normalTissueColnames
loadedData$normal_tissue <- normal_tissue
}
if('Pathology' %in% downloadList) {
temp <- tempfile()
download.file(url=downloadUrls['pathology'],
destfile=temp)
# pathology <- read_tsv(unz(temp, 'pathology.tsv'))
pathology <-
read.delim2(
unz(temp, 'pathology.tsv'),
stringsAsFactors = FALSE,
check.names = FALSE,
strip.white = TRUE,
sep="\t",
na.strings = c("", " ")
) %>% as_tibble()
unlink(temp)
colnames(pathology) <- pathologyColnames
# use correct column type
pathology <- pathology %>%
mutate_at(c('prognostic_favorable',
'unprognostic_favorable',
'prognostic_unfavorable',
'unprognostic_unfavorable'), as.numeric)
loadedData$pathology <- pathology
}
if('Subcellular location' %in% downloadList) {
temp <- tempfile()
download.file(url=downloadUrls['subcellular_location'],
destfile=temp)
# subcellular_location <- read_tsv(unz(temp, 'subcellular_location.tsv'))
subcellular_location <-
read.delim2(
unz(temp, 'subcellular_location.tsv'),
stringsAsFactors = FALSE,
check.names = FALSE,
strip.white = TRUE,
sep="\t",
na.strings = c("", " ")
) %>% as_tibble()
unlink(temp)
if (ncol(subcellular_location) == 14) {
colnames(subcellular_location) <- subcellularLocationColnames
} else if (ncol(subcellular_location) == 11) {
colnames(subcellular_location) <- subcellularLocationColnames_legacy
}
loadedData$subcellular_location <- subcellular_location
}
if('RNA tissue' %in% downloadList) {
message("NOTE: HPAanalyze does not previous version of RNA tissue data.
Download the 'RNA consensus tissue gene data' of the current version.")
temp <- tempfile()
download.file(url="https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip",
destfile=temp)
# rna_tissue <- read_tsv(unz(temp, 'rna_tissue.tsv'))
rna_tissue <-
read.delim2(
unz(temp, 'rna_consensus.tsv'),
stringsAsFactors = FALSE,
check.names = FALSE,
strip.white = TRUE,
sep="\t",
na.strings = c("", " ")
) %>% as_tibble()
unlink(temp)
colnames(rna_tissue) <- rnaTissueColnames
loadedData$rna_tissue <- rna_tissue
}
if('RNA cell line' %in% downloadList) {
message("NOTE: HPAanalyze does not previous version of RNA cell line data.
Download the 'RNA HPA cell line gene data' of the current version.")
temp <- tempfile()
download.file(url="https://www.proteinatlas.org/download/rna_celline.tsv.zip",
destfile=temp)
# rna_cell_line <- read_tsv(unz(temp, 'rna_celline.tsv'))
rna_cell_line <-
read.delim2(
unz(temp, 'rna_celline.tsv'),
stringsAsFactors = FALSE,
check.names = FALSE,
strip.white = TRUE,
sep="\t",
na.strings = c("", " ")
) %>% as_tibble()
unlink(temp)
colnames(rna_cell_line) <- rnaCellLineColnames
loadedData$rna_cell_line <- rna_cell_line
}
if('RNA transcript tissue' %in% downloadList) {
temp <- tempfile()
download.file(url=downloadUrls['transcript_rna_tissue'],
destfile=temp)
# transcript_rna_tissue <- read_tsv(unz(temp, 'transcript_rna_tissue.tsv'))
transcript_rna_tissue <-
read.delim2(
unz(temp, 'transcript_rna_tissue.tsv'),
stringsAsFactors = FALSE,
check.names = FALSE,
strip.white = TRUE,
sep="\t",
na.strings = c("", " ")
) %>% as_tibble()
unlink(temp)
## Old version used tidyr::gather
# transcript_rna_tissue <- tidyr::gather(transcript_rna_tissue,
# key='tissue',
# value='value',
# -'ensgid', -'enstid')
## New version use stats::reshape
transcript_rna_tissue <-
stats::reshape(
transcript_rna_tissue,
direction = "long",
varying = list(3:ncol(transcript_rna_tissue)),
v.names = "value",
timevar = "tissue",
times = c(colnames(transcript_rna_tissue[, 3:ncol(transcript_rna_tissue)]))
) %>%
subset(select = -id)
colnames(transcript_rna_tissue) <- transcriptRnaTissueColnames
loadedData$transcript_rna_tissue <- transcript_rna_tissue
}
if('RNA transcript cell line' %in% downloadList) {
temp <- tempfile()
download.file(url=downloadUrls['transcript_rna_cell_line'],
destfile=temp)
# transcript_rna_cell_line <- read_tsv(unz(temp, 'transcript_rna_celline.tsv'))
transcript_rna_cell_line <-
read.delim2(
unz(temp, 'transcript_rna_celline.tsv'),
stringsAsFactors = FALSE,
check.names = FALSE,
strip.white = TRUE,
sep="\t",
na.strings = c("", " ")
) %>% as_tibble()
unlink(temp)
## Old version used tidyr::gather
# transcript_rna_cell_line <- tidyr::gather(transcript_rna_cell_line,
# key='cell_line',
# value='value',
# -'ensgid', -'enstid')
## New version use stats::reshape
transcript_rna_cell_line <-
stats::reshape(
transcript_rna_cell_line,
direction = "long",
varying = list(3:ncol(transcript_rna_cell_line)),
v.names = "value",
timevar = "cell_line",
times = c(colnames(transcript_rna_cell_line[, 3:ncol(transcript_rna_cell_line)]))
) %>%
subset(select = -id)
colnames(transcript_rna_cell_line) <- transcriptRnaCellLineColnames
loadedData$transcript_rna_cell_line <- transcript_rna_cell_line
}
rm(temp)
}
return(loadedData)
}
#################
## Subset data ##
#################
#' Subset downloaded data
#'
#' \code{hpaSubset()} subsets data by gene name, tissue, cell type, cancer and/or
#' cell line. The input is the list object generated by \code{hpaDownload()} or
#' as the output of another \code{hpaSubset()}. Use \code{hpaListParam()} to see
#' the list of available parameters for a specific list object. Will not work on
#' isoform data.
#'
#' @param data Input the list object generated by \code{hpaDownload()} or
#' \code{hpaSubset()}
#' @param targetGene Vector of strings of HGNC gene symbols. It will be used to
#' subset every dataset in the list object. You can also mix HGNC gene symbols
#' and ensemnl ids (start with ENSG) and they will be converted to HGNC gene
#' symbols.
#' @param targetTissue Vector of strings of normal tissues. Will be used to
#' subset the \code{normal_tissue} and \code{rna_tissue} dataset.
#' @param targetCellType Vector of strings of normal cell types. Will be used to
#' subset the \code{normal_tissue} dataset.
#' @param targetCancer Vector of strings of cancer types. Will be used to subset
#' the \code{pathology} dataset.
#' @param targetCellLine Vector of strings of cell lines. Will be used to subset
#' the \code{rna_cell_line} dataset.
#'
#' @return \code{hpaSubset} will return a list of tibbles as the result of
#' subsetting, depending on the input data.
#'
#' @family downloadable datasets functions
#'
#' @rdname hpaListParam
#'
#' @examples
#' downloadedData <- hpaDownload(downloadList='all', version='example')
#' geneList <- c('TP53', 'EGFR')
#' tissueList <- c('breast', 'cerebellum', 'skin 1')
#' cancerList <- c('breast cancer', 'glioma', 'melanoma')
#'
#' subsetData <- hpaSubset(data=downloadedData,
#' targetGene=geneList,
#' targetTissue=tissueList,
#' targetCancer=cancerList)
#'
#' @import dplyr
#' @export
hpaSubset <- function(data=NULL,
targetGene=NULL,
targetTissue=NULL,
targetCellType=NULL,
targetCancer=NULL,
targetCellLine=NULL) {
# Check if data is provided or not
if (is.null(data)) {
message(paste0('No data provided. Use version ',
hpa_histology_data$metadata$HPAversion,
"."))
data = HPAanalyze::hpa_histology_data
}
if('normal_tissue' %in% names(data)) {
if(!is.null(targetGene)) {
targetGene <- gene_ensembl_convert(targetGene, "gene")
data$normal_tissue <-
filter(data$normal_tissue, gene %in% targetGene)
}
if(!is.null(targetTissue)) {
data$normal_tissue <-
filter(data$normal_tissue, tissue %in% targetTissue)
}
if(!is.null(targetCellType)) {
data$normal_tissue <-
filter(data$normal_tissue, cell_type %in% targetCellType)
}
}
if('pathology' %in% names(data)) {
if(!is.null(targetGene)) {
targetGene <- gene_ensembl_convert(targetGene, "gene")
data$pathology <-
filter(data$pathology, gene %in% targetGene)
}
if(!is.null(targetCancer)) {
data$pathology <-
filter(data$pathology, cancer %in% targetCancer)
}
}
if('subcellular_location' %in% names(data)) {
if(!is.null(targetGene)) {
targetGene <- gene_ensembl_convert(targetGene, "gene")
data$subcellular_location <-
filter(data$subcellular_location, gene %in% targetGene)
}
}
if('rna_tissue' %in% names(data)) {
if(!is.null(targetGene)) {
targetGene <- gene_ensembl_convert(targetGene, "gene")
data$rna_tissue <-
filter(data$rna_tissue, gene %in% targetGene)
}
if(!is.null(targetTissue)) {
data$rna_tissue <-
filter(data$rna_tissue, tissue %in% targetTissue)
}
}
if('rna_cell_line' %in% names(data)) {
if(!is.null(targetGene)) {
targetGene <- gene_ensembl_convert(targetGene, "gene")
data$rna_cell_line <-
filter(data$rna_cell_line, gene %in% targetGene)
}
if(!is.null(targetCellLine)) {
data$rna_cell_line <-
filter(data$rna_cell_line, cell_line %in% targetCellLine)
}
}
return(data)
}
#########################
## List available data ##
#########################
#' List available data
#'
#' \code{hpaListParam()} list available variables in downloaded data that can be
#' used as parameters to subset the data via \code{hpaSubset()}. This function
#' work with the data object generated by \code{hpaDownload()} or a previous
#' call of \code{hpaSubset()}.
#'
#' @return The output of \code{hpaListParam()} is a list of vectors containing
#' all subset parameter for the downloaded data.
#'
#' @examples
#' downloadedData <- hpaDownload(downloadList='all', version='example')
#' params <- hpaListParam(data=downloadedData)
#' params$normal_tissue
#'
#' @import dplyr
#' @importFrom stats na.omit
#' @export
hpaListParam <- function(data=NULL) {
# Check if data is provided or not
if (is.null(data)) {
message(paste0('No data provided. Use version ',
hpa_histology_data$metadata$HPAversion,
"."))
data = HPAanalyze::hpa_histology_data
}
availData <- list()
if('normal_tissue' %in% names(data)) {
availData$normal_tissue <- unique(data$normal_tissue[['tissue']])
availData$normal_cell <- unique(data$normal_tissue[['cell_type']])
}
if('pathology' %in% names(data)) {
availData$cancer <- unique(data$pathology[['cancer']])
}
if('subcellular_location' %in% names(data)) {
availData$subcellular_location <- unique(data$subcellular_location[['approved']])%>%
strsplit(';') %>% unlist() %>%
unique() %>% na.omit() %>% as.vector()
}
if('rna_tissue' %in% names(data)) {
availData$normal_tissue_rna <- unique(data$rna_tissue[['tissue']])
}
if('rna_cell_line' %in% names(data)) {
availData$cell_line_rna <- unique(data$rna_cell_line[['cell_line']])
}
return(availData)
}
#################
## Export data ##
#################
#' Export the subset data
#'
#' Export the list object generated by \code{hpaSubset()} into xlsx format. Due
#' to the size of some HPA datasets, as well as the limitation of the output
#' format, exporting the full datasets generated by \code{hpaDownload()} is not
#' recommended.
#'
#' @param data Input the list object generated by \code{hpaSubset()}
#' @param fileName A string indicate the desired output file name. Do not
#' include file extension such as \code{'.xlsx'}.
#' @param fileType The format as which the data will be exported. Choose one of
#' these options: \code{'xlsx'}, \code{'csv'} and \code{'tsv'}.
#'
#' @return
#' \itemize{
#' \item \code{'xlsx'}: return one .xlsx file named \code{'fileName.xlsx'}.
#' One individual sheet for each dataset in the input list object.
#' \item \code{'csv'}: return .csv files, one for each dataset in the input
#' list object, named \code{'fileName_datasetName.csv'}
#' \item \code{'tsv'}: return .tsv files, one for each dataset in the input
#' list object, named \code{'fileName_datasetName.tsv'}
#' }
#'
#' @family downloadable datasets functions
#'
#' @examples
#' downloadedData <- hpaDownload(downloadList='all', version='example')
#' geneList <- c('TP53', 'EGFR')
#' tissueList <- c('breast', 'cerebellum', 'skin 1')
#' cancerList <- c('breast cancer', 'glioma', 'melanoma')
#'
#' subsetData <- hpaSubset(data=downloadedData,
#' targetGene=geneList,
#' targetTissue=tissueList,
#' targetCancer=cancerList)
#' hpaExport(data=subsetData,
#' fileName='TP53_EGFR_in_tissue_cancer.xlsx',
#' fileType='xlsx')
#'
#' @importFrom openxlsx write.xlsx
#' @importFrom utils write.csv write.table
#' @export
hpaExport <- function(data, fileName, fileType='xlsx') {
if(fileType == 'xlsx') {
write.xlsx(data, file = paste0(fileName, ".xlsx"))
}
if(fileType == 'csv') {
for (i in 1:length(data)) {
write.csv(data[[i]],
file = paste0(fileName, "_", names(data[i]), ".csv"))
}
}
if(fileType == 'tsv') {
for (i in 1:length(data)) {
write.table(data[[i]],
file = paste0(fileName, "_", names(data[i]), ".tsv"),
sep = "\t")
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.