### Constants
# TxDb.* family of packages
TXDBS = c(
'TxDb.Dmelanogaster.UCSC.dm3.ensGene',
'TxDb.Dmelanogaster.UCSC.dm6.ensGene',
'TxDb.Ggallus.UCSC.galGal5.refGene',
'TxDb.Hsapiens.UCSC.hg19.knownGene',
'TxDb.Hsapiens.UCSC.hg38.knownGene',
'TxDb.Mmusculus.UCSC.mm9.knownGene',
'TxDb.Mmusculus.UCSC.mm10.knownGene',
'TxDb.Rnorvegicus.UCSC.rn4.ensGene',
'TxDb.Rnorvegicus.UCSC.rn5.refGene',
'TxDb.Rnorvegicus.UCSC.rn6.refGene')
# org.* family of packages
ORGDBS = data.frame(
genome = c('dm3','dm6','galGal5','hg19','hg38','mm9','mm10','rn4','rn5','rn6'),
org = c('Dm','Dm','Gg','Hs','Hs','Mm','Mm','Rn','Rn','Rn'),
stringsAsFactors = FALSE)
HMMCELLLINES = c('Gm12878','H1hesc','Hepg2','Hmec','Hsmm','Huvec','K562','Nhek','Nhlf')
HMMCODES = c('1_Active_Promoter', '2_Weak_Promoter' ,'3_Poised_Promoter' ,'4_Strong_Enhancer', '5_Strong_Enhancer', '6_Weak_Enhancer', '7_Weak_Enhancer', '8_Insulator', '9_Txn_Transition', '10_Txn_Elongation', '11_Weak_Txn', '12_Repressed', '13_Heterochrom/lo', '14_Repetitive/CNV')
#' Function to recode classes from chromHMM type column
#'
#' @param hmm_codes in the original form from UCSC Genome Browser track.
#'
#' @return A character vector of chromHMM classes with numbers and underscores removed.
reformat_hmm_codes = function(hmm_codes) {
new_codes = sapply(hmm_codes,
function(hmm){paste(unlist(strsplit(hmm,'_'))[-1],collapse='')},
USE.NAMES=FALSE)
return(new_codes)
}
#' Function to return cell line from chromatin annotation shortcut
#'
#' @param shortcut The annotation shortcut, used in \code{build_annotations()}.
#'
#' @return A string of the cell line used in a chromatin annotation shortcut
get_cellline_from_shortcut = function(shortcut) {
return(unlist(strsplit(unlist(strsplit(shortcut,'_'))[2], '-'))[1])
}
#' Function to return cell line from chromatin annotation code
#'
#' @param code The annotation code, used in \code{build_annotations()}.
#'
#' @return A string of the cell line used in a chromatin annotation code
get_cellline_from_code = function(code) {
return(unlist(strsplit(unlist(strsplit(code,'_'))[3], '-'))[1])
}
#' Function listing which annotations are available.
#'
#' This includes the shortcuts. The \code{expand_annotations()} function helps
#' handle the shortcuts.
#'
#' @return A character vector of available annotations.
#'
#' @examples
#' builtin_annotations()
#'
#' @export
builtin_annotations = function() {
# Create annotation code endings
shortcut_ends = c('basicgenes','cpgs')
# Gene codes
gene_genomes = annotatr::builtin_genomes()
gene_ends = c('1to5kb', 'promoters', 'cds', '5UTRs', 'exons', 'firstexons', 'introns', 'intronexonboundaries', 'exonintronboundaries', '3UTRs', 'intergenic')
# CpG codes
cpg_genomes = base::setdiff(annotatr::builtin_genomes(),c('dm3','dm6'))
cpg_ends = c('islands', 'shores', 'shelves', 'inter')
# Chromatin state codes
# Remove numbers, and underscores, and take unique
chromatin_recode = unique(reformat_hmm_codes(HMMCODES))
chromatin_ends = apply(
expand.grid(HMMCELLLINES, chromatin_recode, stringsAsFactors = FALSE),
1, paste, collapse='-')
chromatin_shortcut_ends = apply(
expand.grid(HMMCELLLINES, 'chromatin', stringsAsFactors = FALSE),
1, paste, collapse='-')
# Create full annotation codes
gene_codes = apply(
expand.grid(gene_genomes, 'genes', gene_ends, stringsAsFactors = FALSE),
1, paste, collapse='_')
cpg_codes = apply(
expand.grid(cpg_genomes, 'cpg', cpg_ends, stringsAsFactors= FALSE),
1, paste, collapse='_')
chromatin_codes = apply(
expand.grid('hg19', 'chromatin', chromatin_ends, stringsAsFactors=FALSE),
1, paste, collapse='_')
enhancer_codes = c('hg19_enhancers_fantom','hg38_enhancers_fantom','mm9_enhancers_fantom','mm10_enhancers_fantom')
lncrna_codes = c('hg19_lncrna_gencode','hg38_lncrna_gencode','mm10_lncrna_gencode')
gene_shortcut_codes = apply(
expand.grid(gene_genomes, 'basicgenes', stringsAsFactors = FALSE),
1, paste, collapse='_')
cpg_shortcut_codes = apply(
expand.grid(cpg_genomes, 'cpgs', stringsAsFactors = FALSE),
1, paste, collapse='_')
chromatin_shortcut_codes = paste('hg19', chromatin_shortcut_ends, sep='_')
# Create the big vector of supported annotations
annots = c(gene_codes, cpg_codes, chromatin_codes, enhancer_codes, lncrna_codes,
gene_shortcut_codes, cpg_shortcut_codes, chromatin_shortcut_codes)
return(annots)
}
#' Function returning supported TxDb.* genomes
#'
#' @return A character vector of genomes for supported TxDb.* packages
#'
#' @examples
#' builtin_genomes()
#'
#' @export
builtin_genomes = function() {
return(ORGDBS$genome)
}
#' Function to get correct TxDb.* package name based on genome
#'
#' @param genome A string giving the genome assembly.
#'
#' @return A string giving the name of the correct TxDb.* package name based on \code{genome}.
get_txdb_name = function(genome = annotatr::builtin_genomes()) {
# Ensure valid arguments
genome = match.arg(genome)
db = grep(genome, TXDBS, value = TRUE)
return(db)
}
#' Function to get correct org.* package name based on genome
#'
#' @param genome A string giving the genome assembly.
#'
#' @return A string giving the correct org for org.db packages. e.g. hg19 -> Hs.
get_orgdb_name = function(genome = annotatr::builtin_genomes()) {
# Ensure valid arguments
genome = match.arg(genome)
org = ORGDBS[ORGDBS$genome == genome, 'org']
return(org)
}
#' Function to tidy up annotation accessors for visualization
#'
#' @param annotations A character vector of annotations, in the order they are to appear in the visualization.
#'
#' @return A list of mappings from original annotation names to names ready for visualization.
#' @export
tidy_annotations = function(annotations) {
tidy = sapply(annotations, function(a){
tokens = unlist(strsplit(a,'_'))
if(tokens[2] == 'cpg') {
if(tokens[3] == 'inter') {
return('interCGI')
} else {
return(paste('CpG', tokens[3]))
}
} else if (tokens[2] == 'genes') {
if(tokens[3] == 'firstexons') {
return('first exons')
} else if (tokens[3] == 'intronexonboundaries') {
return('intron/exon boundaries')
} else if (tokens[3] == 'exonintronboundaries') {
return('exon/intron boundaries')
} else {
return(tokens[3])
}
} else if (tokens[2] == 'enhancers') {
return('enhancers')
} else if (tokens[2] == 'chromatin') {
return(tokens[3])
} else if (tokens[2] == 'custom') {
return(tokens[3])
} else if (tokens[2] == 'lncrna') {
return('GENCODE lncRNA')
} else {
return(sprintf('%s %s', tokens[2], tokens[3]))
}
})
flip_tidy = names(tidy)
names(flip_tidy) = tidy
return(as.list(flip_tidy))
}
#' Function to check for valid annotations
#'
#' Gives errors if any annotations are not in builtin_annotations() (and they are not in the required custom format), basicgenes are used, or the genome prefixes are not the same for all annotations.
#'
#' @param annotations A character vector of annotations possibly using the shortcuts
#' @return If all the checks on the annotations pass, returns NULL to allow code to move forward.
check_annotations = function(annotations) {
# Pull out any custom annotations before checking
custom_annotations = grep('custom', annotations, value = TRUE)
annotations = base::setdiff(annotations, custom_annotations)
# Check that the annotations are supported, tell the user which are unsupported
if( !all(annotations %in% annotatr::builtin_annotations()) ) {
unsupported = base::setdiff(annotations, annotatr::builtin_annotations())
stop(sprintf('Error: "%s" is(are) not supported. See builtin_annotations().',
paste(unsupported, collapse=', ')))
}
# Recombine annotations and custom_annotations or you get failure when
# there are only custom annotations
annotations = c(custom_annotations, annotations)
genomes = sapply(annotations, function(a){
unlist(strsplit(a, '_'))[1]
}, USE.NAMES = FALSE)
# Check for same genome on all annotations
if( length(unique(genomes)) != 1 ){
stop('Error: genome prefix on all annotations must be the same.')
}
return(NULL)
}
#' Function to expand annotation shortcuts
#'
#' @param annotations A character vector of annotations, possibly using the shortcut accessors
#'
#' @return A vector of data accession-ized names that are ordered from upstream to downstream in the case of knownGenes and islands to interCGI in the case of cpgs.
#' @export
expand_annotations = function(annotations) {
are_basicgenes = any(grepl('basicgenes', annotations))
are_cpgs = any(grepl('cpgs', annotations))
are_hmms = any(grepl('-chromatin', annotations))
which_are_shortcuts = c(which(grepl('basicgenes', annotations)), which(grepl('cpgs', annotations)), which(grepl('-chromatin', annotations)))
# expand_shortcuts() will always be run after check_annotations() so we can be
# sure that the genome prefixes are the same for all annotaitons.
genome = unique( sapply(annotations, function(a){ unlist(strsplit(a, '_'))[1] }, USE.NAMES = FALSE) )
if(are_basicgenes || are_cpgs || are_hmms) {
# Check for shortcut annotation accessors 'cpgs', 'basicgenes'
# and create the right annotations based on the genome
new_annotations = c()
remove_shortcuts = c()
if(are_cpgs) {
new_annotations = paste(genome, 'cpg', c('islands','shores','shelves','inter'), sep='_')
}
if(are_basicgenes) {
new_annotations = c(new_annotations, paste(genome, 'genes', c('1to5kb','promoters','5UTRs','exons','introns','3UTRs'), sep='_'))
}
if(are_hmms) {
# Could conceivably use shortcuts for multiple cell lines
hmms = grep('-chromatin', annotations, value = TRUE)
cell_lines = sapply(hmms, get_cellline_from_shortcut, USE.NAMES = FALSE)
new_hmm_codes = apply(
expand.grid(cell_lines, unique(reformat_hmm_codes(HMMCODES)), stringsAsFactors = FALSE),
1, paste, collapse='-')
new_annotations = c(new_annotations,
paste(genome, 'chromatin', new_hmm_codes, sep='_'))
}
annotations = base::setdiff(c(annotations, new_annotations), annotations[which_are_shortcuts])
}
return(annotations)
}
#' Function to subset a tbl_df or grouped_df by a column
#'
#' @param tbl A \code{tbl_df} or \code{grouped_df}.
#' @param col A string indicating which column of of \code{tbl} to subset and order
#' @param col_order A character vector indicating the order of \code{col}.
#'
#' @return A modified version of \code{summary} with \code{col} subsetted by \code{col_order}.
#' @export
subset_order_tbl = function(tbl, col, col_order) {
if(!is.null(col)) {
# Collect all types in the column
all_col_names = unique(tbl[[col]])
# Inherit col_order from the order in tbl
if(is.null(col_order)) {
col_order = all_col_names
}
# Check set equality of col in the summary and the col_order
if( !dplyr::setequal(all_col_names, col_order) ) {
if( all(col_order %in% all_col_names) ) {
tbl = subset(tbl, tbl[[col]] %in% col_order)
} else {
# Intersect col_order with unique(tbl[[col]]) to deal with possible 0 tallies
col_order = intersect(col_order, unique(tbl[[col]]))
warning('There are elements in col_order that are not present in the corresponding column. Check for typos, or this could be a result of 0 tallies.')
}
}
# Convert fill to factor with levels in the correct order
tbl[[col]] = factor(tbl[[col]], levels = col_order)
# Also convert the levels to tidy names if fill is annotations
if(col == 'annot.type') {
levels(tbl[[col]]) = tidy_annotations(col_order)
}
}
return(tbl)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.