Nothing
identify_hotspots <- function(mutation_dataset, gene_data , snp_data,
min_n_muts = 5, MAF_thresh = 0.01,
flanking_region = c(200, 300),
poisson.thr = 0.01, percentage.thr = 0.15,
ratio.thr = 45, approach = "percentage"){
### Checking for argument requirements
if(length(min_n_muts) != 1L){
stop("min_n_muts should be an integer.")
}
if(!(length(MAF_thresh) == 1 & is.numeric(MAF_thresh) &
MAF_thresh < 1)){
stop("MAF_thresh must representing a valid minor allele
threshold below 1.")
}
if(!(length(poisson.thr) == 1 & is.numeric(poisson.thr) &
poisson.thr < 1)){
stop("poisson.thr must representing a valid probability
threshold below 1.")
}
if(!(length(percentage.thr) == 1 & is.numeric(percentage.thr) &
percentage.thr < 1)){
stop("percentage.thr must representing a valid percentage
threshold below 1.")
}
if(!(length(ratio.thr) == 1 & is.numeric(ratio.thr) &
ratio.thr > 1)){
stop("ratio.thr must representing a valid ratio
threshold above 1.")
}
if(!(approach %in% c("percentage", "ratio", "both"))){
stop("Approach must be defined as either percentage, ratio, or both.")
}
MutationData.colnames <- c("Ensembl_gene_id","Protein_residue", "Original_aa",
"Mutated_aa","Patient_id","Genomic_coordinate",
"Original_base", "Mutated_base")
gene_id <- MutationData.colnames[1]
protein_res <- MutationData.colnames[2]
original_aa <- MutationData.colnames[3]
mutated_aa <- MutationData.colnames[4]
patient_id <- MutationData.colnames[5]
genomic_coord <- MutationData.colnames[6]
if(sum(colnames(mutation_dataset) %in% MutationData.colnames) == 8){
mutation_dataset <- unique(mutation_dataset[, MutationData.colnames])
} else {
stop("Mutation_dataset should contain the following columns:
Ensembl_gene_id, Protein_residue, Original_aa, Mutated_aa,
Patient_id, Genomic_coordinate, Original_base, Mutated_base.")
}
DominoData.colnames <- c("Ensembl_gene_id","Representative_tr",
"cDNA_length", "Gene_name", "Uniprot_id")
if(sum(colnames(gene_data) %in% DominoData.colnames) == 5){
gene_data <- unique(gene_data[, DominoData.colnames])
} else {
stop("Gene_data should contain the following columns: Ensembl_gene_id,
Representative_tr, cDNA_length, Gene_name, Uniprot_id.")
}
if(!(length(flanking_region) == 2L | length(flanking_region) == 1L)){
stop("Flanking region size needs to be one or two integers.")
}
###
# unlist GRangesList object
if(is(mutation_dataset, "GRangesList")){
mutation_dataset <- unlist(mutation_dataset, use.names=FALSE)
data_type <- class(mutation_dataset)
}
# retrieve data from GRanges object
if(is(mutation_dataset, "GRanges")){
chr_nu <- as.numeric(seqnames (mutation_dataset))
positions <- start(mutation_dataset)
single_nt_ctrl <- end(mutation_dataset)
genomic_pos <- paste(chr_nu, positions, sep = ":")
mut_meta_data <- as.data.frame(GenomicRanges::mcols(mutation_dataset))
cols <- colnames(mut_meta_data)
ref <- grep("ref", cols, ignore.case = TRUE)
mut <- grep("mut", cols, ignore.case = TRUE)
aa <- grep("aa", cols, ignore.case = TRUE)
ref_aa <- intersect(ref, aa)
mut_aa <- intersect(mut, aa)
patient <- grep("patient", cols, ignore.case = TRUE)
gene_ident <- grep("gene", cols, ignore.case = TRUE)
prot <- grep("protein", cols, ignore.case = TRUE)
ppos <- grep("loc", cols, ignore.case = TRUE)
prot_pos <- intersect(prot, ppos)
mutation_dataset <- cbind(mut_meta_data[, c(gene_ident, prot_pos,
ref_aa, mut_aa, patient)],
genomic_pos)
ok_rows <- positions == single_nt_ctrl
mutation_dataset <- mutation_dataset[ok_rows,]
colnames(mutation_dataset) <- c ("Ensembl_gene_id", "Protein_residue",
"Original_aa", "Mutated_aa",
"Patient_id", "Genomic_coordinate")
}
# if gene_data is provided in TxDB format convert to data frame
if(is(gene_data, "TxDb")){
gene_data <- import_txdb(gene_data)
}
if(length(snp_data) > 0){
### convert data.frame into GPos
if(is(snp_data, "data.frame")){
chr_info <- paste("chr", snp_data$Chr_name)
freq_info <- snp_data$Minor_allele_freq
position_on_chr <- snp_data$Position_on_chr
snp_data <- GPos(seqnames=chr_info, pos= position_on_chr, stitch=FALSE)
GenomicRanges::mcols(snp_data)$Minor_allele_freq <- freq_info
}
### convert vcf into GPos
if(is(snp_data, "CollapsedVCF")){
vcf_table <- info(snp_data) ### za frequency
coords <- rowRanges(snp_data)
common_vars <- row.names(vcf_table[vcf_table$LDAF > MAF_thresh, ])
coords_common_vars <- coords[common_vars, ]
variant_start <- start(ranges(coords_common_vars))
variant_end <- end(ranges(coords_common_vars))
snps <- variant_start == variant_end
chromosomes <- as.character(seqnames(coords_common_vars[snps]))
freq_info <- vcf_table[common_vars[snps],"LDAF"]
chr_info = paste("chr", chromosomes)
position_on_chr = variant_start[snps]
snp_data <- GPos(seqnames=chr_info, pos= position_on_chr, stitch=FALSE)
GenomicRanges::mcols(snp_data)$Minor_allele_freq <- freq_info
}
snp_high_freq <- snp_data[snp_data$Minor_allele_freq > MAF_thresh]
snp_pos <- paste(as.numeric(seqnames(snp_high_freq)),
pos(snp_high_freq), sep = ":")
# filter out all snps
muts_snps <- mutation_dataset[,genomic_coord] %in% snp_pos
if(sum(muts_snps) > 0){
mutation_dataset <- mutation_dataset[!muts_snps,]
message(" ", sum(muts_snps), " mutations overlap with common
population SNPs and were removed.")
}
} else {warning("No SNP-table provided.")}
###
ns_mut_dataset <- mutation_dataset[
as.character(mutation_dataset[,original_aa]) !=
as.character(mutation_dataset[,mutated_aa]), ]
ns_mut_dataset <- unique(ns_mut_dataset[,c(gene_id,protein_res,patient_id)])
# count how many times mutation exists
ns_mut_dataset_key <- paste(ns_mut_dataset[,gene_id],
ns_mut_dataset[,protein_res], sep = "-")
t <- table(ns_mut_dataset_key)
muts_to_check <- names(t[t >= min_n_muts])
message(" There are in total ", length(muts_to_check),
" protein residues with ", min_n_muts, " or more mutation.")
if(length(muts_to_check) == 0){
stop("Exiting since there are no mutations to check.")
}
message(" ***Checking potential hotspots.***")
results <- data.frame(stringsAsFactors = FALSE)
field <- 1
for (s in muts_to_check){
g_poz <- unlist(strsplit(s, "-"))
gene <- g_poz[1]
mut_pos_numeric <- as.numeric(g_poz[2])
tot_mut_hotsp <- as.numeric(t[s])
# calculate amino acid length of protein
length_cdna <- as.numeric(gene_data[gene_data$Ensembl_gene_id==gene,
"cDNA_length"])[1]
if((length(length_cdna) == 0) || is.na(length_cdna)){
warning("No cDNA length provided for ", gene, ".")
} else {
length_aa_in <- (length_cdna/3)-1
length_aa <- round(length_aa_in)
# retrieve gene information
rep_tr <- as.character(gene_data[gene_data$Ensembl_gene_id==gene,
"Representative_tr"])
gene_symbol <- unique(as.character(gene_data[gene_data$Ensembl_gene_id==gene,
"Gene_name"]))
assoc_unip <- unique(as.character(gene_data [gene_data$Ensembl_gene_id==gene,
"Uniprot_id"]))
if(length(gene_symbol) > 1){
#warning("Several gene symbols for:, ", gene , paste(gene_symbol,collapse = ","))
gene_symbol <- gene_symbol[1]
}
boundary <- calculate_boundary(mut_pos_numeric, length_aa, flanking_region)
to_the_left1 <- boundary[["region_1"]][1]
to_the_left2 <- boundary[["region_2"]][1]
to_the_right1 <- boundary[["region_1"]][2]
to_the_right2 <- boundary[["region_2"]][2]
all_mut_1 = mutation_dataset[mutation_dataset[,gene_id] == gene &
mutation_dataset[,protein_res] >= to_the_left1 &
mutation_dataset[,protein_res] <= to_the_right1,
c(gene_id,protein_res,patient_id)]
all_mut_2 = mutation_dataset[mutation_dataset[,gene_id] == gene &
mutation_dataset[,protein_res] >= to_the_left2 &
mutation_dataset[,protein_res] <= to_the_right2,
c(gene_id,protein_res,patient_id)]
tot_n_mut_1 = nrow(all_mut_1)
tot_n_mut_2 = nrow(all_mut_2)
ratio_region_1 <- tot_mut_hotsp/tot_n_mut_1
ratio_region_2 <- tot_mut_hotsp/tot_n_mut_2
exp_per_res_1 <- tot_n_mut_1/(to_the_right1 - to_the_left1 + 1)
exp_per_res_2 <- tot_n_mut_2/(to_the_right2 - to_the_left2 + 1)
reg_leng1 = to_the_right1 - to_the_left1 + 1
p_excp_av1 = tot_n_mut_1/reg_leng1
p_result1 = ppois(q = tot_mut_hotsp, lambda = p_excp_av1,
lower.tail = FALSE)
reg_leng2 = to_the_right2 - to_the_left2 + 1
p_excp_av2 = tot_n_mut_2/reg_leng2
p_result2 = ppois(q = tot_mut_hotsp, lambda = p_excp_av2,
lower.tail = FALSE)
# select only residues that meet poisson.thr, percentage.thr and ratio.thr
if((p_result1 < poisson.thr) & (p_result2 < poisson.thr)){
write.line <- FALSE
if(approach == "percentage"){
if((ratio_region_1 > percentage.thr) & (ratio_region_2 > percentage.thr)){
write.line <- TRUE
}
}
else if(approach == "ratio"){
if(((tot_mut_hotsp/exp_per_res_1) > ratio.thr) &
((tot_mut_hotsp/exp_per_res_1) > ratio.thr)){
write.line <- TRUE
}
}
else if(approach == "both"){
if(((ratio_region_1 > percentage.thr) &
(ratio_region_2 > percentage.thr)) &
(((tot_mut_hotsp/exp_per_res_1) > ratio.thr) &
((tot_mut_hotsp/exp_per_res_1) > ratio.thr))){
write.line <- TRUE
}
}
if(write.line){
region_1 <- paste(to_the_left1, to_the_right1, sep="-")
region_2 <- paste(to_the_left2, to_the_right2, sep="-")
ratio_region_1 = sprintf("%1.2f%%", 100*ratio_region_1)
ratio_region_2 = sprintf("%1.2f%%", 100*ratio_region_2)
results[field, "Gene"] = gene_symbol
results[field, "Ensembl_id"] = gene
results[field, "Repres_tr"] = rep_tr
results[field, "Prot_position"] = mut_pos_numeric
results[field, "N_mut"] = tot_mut_hotsp
results[field, "Prot_length"] = length_aa
results[field, "Assoc_unip_ids"] = assoc_unip
results[field, "Perc_region_1"] = ratio_region_1
results[field, "Perc_region_2"] = ratio_region_2
results[field, "Poisson_p_value_1"] = p_result1
results[field, "Poisson_p_value_2"] = p_result2
field = field + 1
}
}
if(nrow(results) > 0){
results <- results[order(results[,"Poisson_p_value_2"], decreasing = FALSE), ]
results[,"Adj_p_value_region_1"] <- p.adjust(results[,"Poisson_p_value_2"],
method = "BH")
results[,"Adj_p_value_region_2"] <- p.adjust(results[,"Poisson_p_value_1"],
method = "BH")
results <- results[results$Adj_p_value_region_1 < poisson.thr &
results$Adj_p_value_region_2 < poisson.thr,]
}
}
}
#message (" ***Identified hotspots are stored in the object: results.***")
return(results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.