Nothing
# run "FUNC" with gene expression data from Allen Brain Atlas
#######################################################################################
# 1. check arguments and define parameters
# 2. load expression data
# 3. loop over age categories and cutoffs
# 3a) create input
# 3b) run func
# 3c) summarize func output
# 4. create output
#######################################################################################
aba_enrich=function(genes, dataset="adult", test="hyper", cutoff_quantiles=seq(0.1,0.9,0.1), n_randsets=1000, gene_len=FALSE, circ_chrom=FALSE, ref_genome="grch37", gene_coords=NULL, silent=FALSE){
##### 1. Check arguments and define parameters
## still allow vector as input for hyper and wilcox (like in older versions)
if (!((is.vector(genes) && !is.null(names(genes))) || is.data.frame(genes))){
stop("Please provide a data frame as 'genes' input (also named vector is still accepted for hypergeometric or wilcoxon rank-sum test).")
}
if (is.vector(genes)){
genes = data.frame(gene=names(genes), score=unname(genes), stringsAsFactors=FALSE)
} else {
genes[,1] = as.character(genes[,1])
}
## Define dataset dependent parameters
if (dataset=="adult"){
folder_ext="adult"
root_id="Allen:4005"
} else if (dataset=="5_stages"){
folder_ext="developmental"
root_id="Allen:10153"
} else if (dataset=="dev_effect"){
folder_ext="developmental"
root_id="Allen:10153"
} else {
stop("Not a valid dataset. Please use 'adult', '5_stages' or 'dev_effect'.")
}
## Check arguments
if (!silent) message("Checking arguments...")
# genes
if (anyNA(genes)){
stop("NAs found in 'genes'-input. Please provide finite values only.")
}
if (test=="hyper"){
if (!(is.data.frame(genes) && ncol(genes)==2 && is.numeric(genes[,2]))){
stop("Please provide a data frame with 2 columns [gene, 0/1] as input for hypergeometric test.")
}
if (!(all(genes[,2] %in% c(1,0)))){
stop("Please provide only 1/0-values in 2nd column of 'genes'-input for hypergeometric test.")
}
if (sum(genes[,2])==0){
stop("Only background genes defined (only 0s in 'genes[,2]'). Please provide candidate genes denoted by 1.")
}
} else if (test=="wilcoxon"){
if (!(is.data.frame(genes) && ncol(genes)==2 && is.numeric(genes[,2]))){
stop("Please provide a data frame with 2 columns [gene, score] as input for wilcoxon rank-sum test.")
}
if (nrow(genes) < 2){
stop("Only one gene provided as input.")
}
} else if (test=="binomial"){
if (!(is.data.frame(genes) && ncol(genes)==3 && all(sapply(genes,is.numeric) == c(0,1,1)))){
stop("Please provide a data frame with columns [gene, count1, count2] as input for binomial test.")
}
if (any(genes[,2:3] < 0) | any(genes[,2:3] != round(genes[,2:3]))){
stop("Please provide non-negative integers in columns 2-3.")
}
} else if (test=="contingency"){
if (!(is.data.frame(genes) && ncol(genes)==5 && all(sapply(genes,is.numeric) == c(0,1,1,1,1)))){
stop("Please provide a data frame with columns [gene, count1A, count2A, count1B, count2B] as input for contingency table test.")
}
if (any(genes[,2:5] < 0) | any(genes[,2:5] != round(genes[,2:5]))){
stop("Please provide non-negative integers in columns 2-5.")
}
} else {
stop("Not a valid test. Please use 'hyper', 'wilcoxon', 'binomial' or 'contingency'.")
}
# check for multiple assignment for one gene
genes = unique(genes) # allow for multiple assignment of same value
multi_genes = sort(unique(genes[,1][duplicated(genes[,1])]))
if (length(multi_genes) > 0){
if (test == "hyper"){
# allow but remove background definition of candidate genes
# (FUNC just takes one value per gene, candidate genes are implicitly part of background)
candi_genes = genes[genes[,2]==1, 1]
genes = genes[!(genes[,1] %in% candi_genes & genes[,2] == 0),]
} else {
stop("Genes with multiple assignment in input: ", paste(multi_genes,collapse=", "))
}
}
# other arguments
if (class(cutoff_quantiles)!="numeric"){
stop("Please enter numeric cutoff_quantiles.")
}
if (min(cutoff_quantiles)<=0 | max(cutoff_quantiles)>=1){
stop("Please enter cutoff_quantiles between 0 and 1 (exclusive).")
}
valid_datasets = c("adult","5_stages","dev_effect")
if (!(dataset %in% valid_datasets)){
stop("Not a valid dataset. Please use 'adult', '5_stages' or 'dev_effect'.")
}
if (length(n_randsets)>1 || !is.numeric(n_randsets) || n_randsets<1){
stop("Please define 'n_randsets' as a positive integer.")
}
if (n_randsets != round(n_randsets)){
n_randsets = round(n_randsets)
warning(paste("'n_randsets' is expected to be an integer and was rounded to ",n_randsets,".",sep=""))
}
if (!is.logical(gene_len)){
stop("Please set gene_len to TRUE or FALSE.")
}
if (!is.logical(circ_chrom)){
stop("Please set circ_chrom to TRUE or FALSE.")
}
if (gene_len & !(test == "hyper")){
stop("Argument 'gene_len = TRUE' can only be used with 'test = 'hyper''.")
}
if (!(ref_genome %in% c("grch37", "grch38"))){
stop("Not a valid ref_genome. Please use 'grch37' or 'grch38'.")
}
# Create tempfile prefix (in contrast to tempdir() alone, this allows parallel processing)
directory = tempfile()
# dir.create("tempdir"); directory = paste("./tempdir/tempfile",Sys.info()["nodename"],sep="_")
# load gene_list and get gene identifier
gene_symbols = get(paste("gene_symbols",folder_ext,sep="_"))
identifier = detect_identifier(genes[1,1])
if (identifier == "blocks"){
regions = TRUE
} else {
regions = FALSE
}
# load gene_coords
custom_coords = !is.null(gene_coords)
if (gene_len || regions){
if (custom_coords){
# internal gene_coords has chr,start,stop,hgnc,ensembl,entrez
# get_genes_from regions just takes gene_coords[,4] - identifier doesnt matter
gene_coords = gene_coords[,c(2:4,1)]
identifier = detect_identifier(gene_coords[1,4])
colnames(gene_coords)[4] = identifier
} else {
gene_coords = get(paste("gene_coords_", ref_genome, sep=""))
# only for gene-len + default coords initial identifier stays
if (regions){
identifier = detect_identifier(gene_coords[1,4])
}
}
# remove genes with multiple coords
gene_coords = gene_coords[,c(1:3, which(colnames(gene_coords) == identifier))]
gene_coords = unique(gene_coords)
multi_coords = sort(unique(gene_coords[,4][duplicated(gene_coords[,4])]))
gene_coords = gene_coords[! gene_coords[,4] %in% multi_coords,]
}
if (regions){
# check that background region is specified
if (sum(genes[,2]) == nrow(genes)){
stop("All values of the 'genes' input are 1. Using chromosomal regions as input requires defining background regions with 0.")
}
# check that test is hyper
if (test != "hyper"){
stop("chromosomal regions can only be used with test='hyper'.")
}
# warn if gene_len=TRUE, although regions are used
if (gene_len){
warning("Unused argument: 'gene_len = TRUE'.")
}
# convert coords from genes-vector to bed format, SORT, and extract genes overlapping test regions
bed = get_genes_from_regions(genes, gene_coords, circ_chrom) # gene_coords from sysdata.rda HGNC
test_regions = bed[[1]]
bg_regions = bed[[2]]
genes = bed[[3]]
# since custom gene_coords are allowed this can be any of the 3
identifier = detect_identifier(genes[1,1])
# avoid scientific notation in regions (read in c++)
test_regions = format(test_regions, scientific=FALSE, trim=TRUE)
bg_regions = format(bg_regions, scientific=FALSE, trim=TRUE)
if (!silent){
message("Candidate regions:")
print(test_regions)
message("Background regions:")
print(bg_regions)
}
# write regions to files
write.table(test_regions,file=paste(directory, "_test_regions.bed",sep=""),col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
write.table(bg_regions,file=paste(directory, "_bg_regions.bed",sep=""),col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
} else {
if (circ_chrom == TRUE){
# warn if circ_chrom=TRUE, although individual genes are used
warning("Unused argument: 'circ_chrom = TRUE'.")
}
}
if (ref_genome == "grch38" & (!(regions | gene_len) | custom_coords)){
# warn if ref-genome is switched to hg20 but not used
warning("Unused argument: 'ref_genome = grch38'.")
}
gene_list = gene_symbols[,identifier]
# restrict to genes that have expression data annotated and warn about the rest
gene_values = genes[genes[,1] %in% gene_list, ] # restrict
not_in = unique(genes[!genes[,1] %in% gene_values[,1], 1]) # removed
if (length(not_in) > 0 && !regions){
not_in_string = paste(not_in,collapse=", ")
warning(paste("No expression data for genes: ",not_in_string,".\n These genes were not included in the analysis.",sep=""))
}
# restrict to genes that have coordinates and warn about the rest
if (gene_len){
# check that custom coords and gene input match
if (!is.null(gene_coords) && identifier != colnames(gene_coords)[4]){
stop("Gene identifiers in 'genes' and 'gene_coords' arguments do not match.")
}
# removed
no_coords = unique(gene_values[!(gene_values[,1] %in% gene_coords[,identifier]), 1])
if (length(no_coords) > 0){
gene_values = gene_values[!(gene_values[,1] %in% no_coords), ] # restrict
no_coords_string = paste(no_coords,collapse=", ")
warning(paste("No coordinates available for genes: ",no_coords_string,".\n These genes were not included in the analysis.",sep=""))
not_in = c(not_in, no_coords)
}
}
# after removing genes without expression data or coordinates: are enough genes left?
if (length(not_in) > 0){
# is any gene in the data? if not, stop.
if (nrow(gene_values)==0) {
stop("No requested genes in data.")
}
# for 0/1 data: are test genes and background genes in the data (background only if background specified)?
if (test=="hyper" && sum(gene_values[,2])==0) {
stop("No requested test genes in data.")
}
if (test=="hyper" && 0 %in% genes[,2] && all(gene_values[,2]==1)) {
stop("No requested background genes in data.") # although background was defined
}
# at least two for wilcoxon
if (test=="wilcoxon" && nrow(gene_values) < 2) {
stop(paste("Less than 2 genes have annotated GO-categories.",sep=""))
}
}
# candidate genes
if (test=="hyper"){
interesting_genes = gene_values[gene_values[,2]==1, 1]
}
##### 2. Load expression data
# load pre input
if (!silent) message("Loading dataset...")
gene_expr = load_by_name(paste("dataset",dataset,sep="_"), silent)
gene_expr = as.data.table(gene_expr)
# compute cutoffs
if (!silent) message("Computing cutoffs... ")
cutoff_quantiles = sort(cutoff_quantiles)
cutoffs = gene_expr[,list(q=quantile(signal, probs=cutoff_quantiles)), by=age_category]
# write to a list (to be consistent with tapply-approach that was previously used)
cutoff_list = tapply(cutoffs$q, cutoffs$age_category, function(x){
names(x) = paste(100*cutoff_quantiles, "%", sep=""); return(x)}, simplify=FALSE)
# select gene identifier
gene_expr = gene_expr[,c("age_category",identifier,"structure","signal"), with=FALSE]
colnames(gene_expr)[2] = "gene_id"
# restrain input to required genes (unless test=hypergeometric and background genes are not defined - all other genes are background in this case)
# else exlude NAs if entrez-ID, (hgnc and ensembl dont have NAs)
if (!(test=="hyper" & sum(genes[,2]) == nrow(genes))){
if (!silent) message("Select requested genes...")
gene_expr = gene_expr[gene_id %in% gene_values[,1],]
} else if (identifier == "entrezgene"){
if (!silent) message("Exclude NAs...")
gene_expr = gene_expr[!is.na(gene_expr$gene_id),]
}
# aggregate expression per gene (duplicates due to different identifiers)
if (!silent) message("Aggregate expression per gene...")
gene_expr_ag = gene_expr[,mean(signal), by=list(age_category, gene_id, structure )]
colnames(gene_expr_ag)[4] = "signal"
# Add "Allen:"-string to brain region IDs
gene_expr_ag[,structure:=paste("Allen:", gene_expr_ag[,structure], sep="")]
# load ontology and write files to tmp
term = get(paste("term",folder_ext,sep="_"))
term2term = get(paste("term2term",folder_ext,sep="_"))
graph_path = get(paste("graph_path",folder_ext,sep="_"))
go_paths = paste0(directory, c("_term.txt", "_term2term.txt", "_graph_path.txt"))
write.table(term,file=go_paths[1],col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
write.table(term2term,file=go_paths[2],col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
write.table(graph_path,file=go_paths[3],col.names=FALSE,row.names=FALSE,quote=FALSE,sep="\t")
##### 3. loop over age categories and cutoffs
# initialize output
out = data.frame()
for (s in unique(gene_expr_ag$age_category)){
stage_expr = gene_expr_ag[gene_expr_ag$age_category==s,2:4] # gene, structure, expression
# get cutoffs
cutoff = cutoff_list[[as.character(s)]]
for (i in 1:length(cutoff)){
## 3a) create input
if (!silent){
if (dataset=="5_stages"){
message(paste("Preparing age category ", s," with ",names(cutoff[i]),"-quantile expression cutoff for FUNC...",sep=""))
} else {
message(paste("Preparing data with ",names(cutoff[i]),"-quantile expression cutoff for FUNC...",sep=""))
}
}
# restrain input with cutoff
if (!silent) message(" Apply cutoff...")
# expressed_genes = expression for input genes in structures where it's above cutoff
# gene, structure, expression
expressed_genes = as.data.frame(stage_expr[stage_expr$signal >= cutoff[i],])
### prepare input data ('infile-data' and 'root' in c++ scripts)
# 'infile-data'
# gene | value1 | value2 ...
# subset genes-input to genes expressed at this age/cutoff
infile_data = gene_values[gene_values[,1] %in% expressed_genes[,1],] # gene|value1|value2...
# check if cutoff too high (wilcox needs 2, other tests 1 gene (but not meaningful...))
if (!silent){
message(" Check that there are sufficient genes above cutoff...")
}
breaky = FALSE
if (test != "hyper" && nrow(infile_data) < 2){ # hyper can have one candi-gene and default bg
breaky = TRUE
if (!silent){
warning(paste("No statistics for cutoff >= ",cutoff_quantiles[i],". Less than two genes have expression above cutoff.",sep=""))
}
}
# for hyper infile-data is just a column with candidate genes
# for hyper also check if candidate and background are present
if (test=="hyper"){
infile_data = infile_data[infile_data[,2]==1, 1]
if (length(infile_data)==0){
breaky = TRUE
warning(paste("No statistics for cutoff >= ",cutoff_quantiles[i],". No candidate gene expression above cutoff.",sep=""))
}
else if (length(infile_data)==nrow(expressed_genes)){
breaky = TRUE
warning(paste("No statistics for cutoff >= ",cutoff_quantiles[i],". No background gene expression above cutoff.",sep=""))
}
}
if (breaky){
# finish at this cutoff or throw error if this is first (lowest) cutoff
if (i==1){
stop("Expression cutoffs too high.")
} else {
break
}
}
# 'root'
# gene | (chrom | start | end) | Allen:1 Allen:2 Allen:3
anno = tapply(expressed_genes[,2], expressed_genes[,1], function(x) {paste(x,collapse=" ")})
gene = as.character(names(anno))
if (regions || gene_len){
# add coordinates
gene_position = gene_coords[match(gene, gene_coords[,identifier]),1:3]
root = data.frame(genes=gene, gene_position, nodes=as.character(anno))
# remove genes with unknown coordinates (possible for default bg in gene_len-option)
if (gene_len){
root = root[!is.na(root[,3]),]
}
# just in case - avoid scientific notation of gene coordinates
root[,3:4] = format(root[,3:4], scientific=FALSE, trim=TRUE)
} else {
root = data.frame(genes=gene, nodes=as.character(anno))
}
# write input-files to tmp-directory
# last priority rename root and infile-data files to all_genes_annotation and test_genes
write.table(infile_data,sep="\t",quote=FALSE,col.names=FALSE,row.names=FALSE,file=paste(directory,"_infile-data",sep=""))
write.table(root,sep="\t",quote=FALSE,col.names=FALSE,row.names=FALSE,file=paste(directory,"_",root_id,sep=""))
### 3b) run func
if (!silent) message("Run Func...\n")
if (test == "hyper"){
# randomset
if (regions & circ_chrom){
GOfuncR:::hyper_randset(paste(directory,"_",root_id,sep=""), n_randsets, directory,
go_paths[1], go_paths[2], go_paths[3], root_id, "roll" , silent)
} else if (regions){
GOfuncR:::hyper_randset(paste(directory,"_",root_id,sep=""), n_randsets, directory,
go_paths[1], go_paths[2], go_paths[3], root_id, "block", silent)
} else if (gene_len){
GOfuncR:::hyper_randset(paste(directory,"_",root_id,sep=""), n_randsets, directory,
go_paths[1], go_paths[2], go_paths[3], root_id, "gene_len", silent)
} else {
GOfuncR:::hyper_randset(paste0(directory,"_",root_id), n_randsets, directory,
go_paths[1], go_paths[2], go_paths[3], root_id, "classic", silent)
}
# category test
GOfuncR:::hyper_category_test(directory, 1, root_id, silent)
} else if (test == "wilcoxon"){
GOfuncR:::wilcox_randset(paste(directory,"_",root_id,sep=""), n_randsets, directory, go_paths[1], go_paths[2], go_paths[3], root_id, silent)
GOfuncR:::wilcox_category_test(directory, 1, root_id, silent)
} else if (test == "binomial"){
GOfuncR:::binom_randset(paste(directory,"_",root_id,sep=""), n_randsets, directory, go_paths[1], go_paths[2], go_paths[3], root_id, silent)
GOfuncR:::binom_category_test(directory, 1, root_id, silent)
} else if (test == "contingency"){
GOfuncR:::conti_randset(paste(directory,"_",root_id,sep=""), n_randsets, directory, go_paths[1], go_paths[2], go_paths[3], root_id, silent)
GOfuncR:::conti_category_test(directory, 1, root_id, silent)
}
## 3c) summarize func-output
groupy = read.table(paste(directory,"_category_test_out",sep=""))
# switch columns for binomial test high B - then high A to be more consistent
if (test == "binomial"){
groupy[,2:5] = groupy[,c(3,2,5,4)]
}
# remove expected and actual no. of candidate, ranksum (included in c++ files to be the same for go_enrich)
groupy = groupy[,1:5]
age_category = rep(s,nrow(groupy))
cutoff_quantile = rep(cutoff_quantiles[i],nrow(groupy))
cutoff_value = rep(cutoff[i],nrow(groupy))
groups = cbind(age_category,cutoff_quantile,cutoff_value,groupy)
# combine with output from previous cutoffs
out = rbind(out,groups)
} # end cutoffs
} # end ages
#####. 4 rearrange output
if (!silent) message("Creating output...")
# remove redundant rows (nodes with no data and only one child)
cluster = get(paste("node_clusters",folder_ext,sep="_"))
results = rearrange_output(out,cluster,term)
# rearrange cutoff-list
cutoffs = do.call(cbind.data.frame, cutoff_list)
colnames(cutoffs) = paste("age_category",colnames(cutoffs),sep="_")
# sort genes alphabetically (useful for regions input)
gene_values = gene_values[mixedorder(gene_values[,1]),]
rownames(gene_values) = 1:nrow(gene_values)
final_output = list(results=results, genes=gene_values, cutoffs=cutoffs)
if (!silent) message("\nDone.\n")
return(final_output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.