map_to_func_elem <- function(hotspot_results, write_to_file = "NO", ens_release = "109"){
## Checking for argument requirements
if(!(write_to_file %in% c("YES", "NO"))){
stop("write_to_file should either be YES or NO")
}
###
message (" Obtaining sequences of proteins encoded by the representative Ensembl transcripts.")
if(ens_release == 109) {
ens_database <- "https://feb2023.archive.ensembl.org"
} else {
ens_database <- ens_release
}
#####
# to fail gracefully if Ensembl is down
# code from http://bioconductor.org/developers/how-to/web-query/
N.TRIES <- as.integer(5)
stopifnot(length(N.TRIES) == 1L, !is.na(N.TRIES))
while (N.TRIES > 0L) {
ensembl <- tryCatch(useMart(biomart="ENSEMBL_MART_ENSEMBL", host = ens_database, dataset = "hsapiens_gene_ensembl"), error=identity)
if (!inherits(ensembl, "error"))
break
N.TRIES <- N.TRIES - 1L
}
if (N.TRIES == 0L) {
stop("'useMart()' failed:",
"\n error: ", conditionMessage(ensembl))
}
#####
# ensembl = useMart(biomart="ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl") # --- for current version
transc_ids = hotspot_results$Repres_tr
prot_seq = getSequence (id = transc_ids, type= "ensembl_transcript_id", seqType = "peptide", mart = ensembl)
gff.file <- list()
uni.sequences <- c()
message (" Obtaining UniProtKB sequences and coordinates of protein functional elements.")
genes_w_hotspots <- unique(hotspot_results$Ensembl_id)
for(gene_ens in genes_w_hotspots){
gene_unip <- hotspot_results [hotspot_results$Ensembl_id==gene_ens, "Assoc_unip_ids"]
assoc_unip_ids <- as.character(gene_unip)
if(length(grep(",", assoc_unip_ids))){
assoc_unip_ids <- trimws(unlist(strsplit(assoc_unip_ids, ",")))
}
for (unip_acc in assoc_unip_ids){
if (!(is.na(unip_acc)))
{
if(nchar(unip_acc) == 6){
retrieve_unip <- paste("http://www.uniprot.org/uniprot/", unip_acc, ".gff", sep = "")
#####
# to fail gracefully if Uniprot is not accessible
# code from http://bioconductor.org/developers/how-to/web-query/
N.TRIES <- as.integer(5)
stopifnot(length(N.TRIES) == 1L, !is.na(N.TRIES))
while (N.TRIES > 0L) {
result <- tryCatch(read.delim(retrieve_unip, skip = 2, header = FALSE), error=identity)
if (!inherits(result, "error"))
break
N.TRIES <- N.TRIES - 1L
}
if (N.TRIES == 0L) {
stop("Retrieving sequence from Uniprot failed:\n",
retrieve_unip,
"\n error: ", conditionMessage(result))
}
#####
gff.file[[unip_acc]] <- result
retrieve_unip <- paste ("http://www.uniprot.org/uniprot/", unip_acc, ".fasta", sep = "")
#####
# to fail gracefully if Uniprot is not accessible
# code from http://bioconductor.org/developers/how-to/web-query/
N.TRIES <- as.integer(5)
stopifnot(length(N.TRIES) == 1L, !is.na(N.TRIES))
while (N.TRIES > 0L) {
result <- tryCatch(readAAStringSet(retrieve_unip), error=identity)
if (!inherits(result, "error"))
break
N.TRIES <- N.TRIES - 1L
}
if (N.TRIES == 0L) {
stop("Reading sequence from Uniprot failed:\n",
, retrieve_unip,
"\n error: ", conditionMessage(result))
}
#####
uni.sequences <- c(uni.sequences, result)
}else {
warning("IDs for following uniprot accessions could not be retrieved:", unip_acc, " that corresponds to:", gene_ens, ".")
}
}
}
}
gff.file <- lapply(gff.file, function(x)x[,c(1,2,3,4,5,9)])
gff.file <- lapply(gff.file, function(x){
colnames(x) <- c("ID", "Source", "Type", "Start", "End", "Description"); x
})
uni.sequences <- do.call(c, uni.sequences)
message (" Assessing sequence agreement and mapping hotspots to functional elements.")
pb <- txtProgressBar(min = 0, max = nrow(hotspot_results))
for(i in seq_len(nrow(hotspot_results))){
setTxtProgressBar(pb, i)
#print(i)
uni_genes <- hotspot_results[i, "Assoc_unip_ids"]
ensembl_gene <- hotspot_results[i, "Ensembl_id"]
ensembl_transcript <- hotspot_results[i, "Repres_tr"]
ensembl_mut_position <- hotspot_results[i, "Prot_position"]
ens.seq <- prot_seq[prot_seq$ensembl_transcript_id==ensembl_transcript, "peptide"]
ids.uniprot <- c()
all_unip <- unlist(strsplit(uni_genes, ","))
for (uni_gene in all_unip)
{
id.unip <- grep(uni_gene, names(uni.sequences))
ids.uniprot <- c(ids.uniprot, id.unip)
}
#### nema if, samp for loop za sve uniprot ids i posli uniq od overlapa
unip_func_elem <- c()
if((is.na(ens.seq)) || (ens.seq == "Sequence unavailable")) {
warning ("No ensembl sequence available for ensembl_gene: ", ensembl_gene, " where the representative transcript is ", ensembl_transcript)
} else if (is.na (ids.uniprot[1])) {
# warning ("No UniProtKB identifier available for ensembl_gene: ", ensembl_gene)
}
else {
ens.seq <- gsub("\\*", "", ens.seq)
ens.seq <- AAString(ens.seq)
for (id.uniprot in ids.uniprot) {
uni.seq <- as.character(uni.sequences[[id.uniprot]])
uni.seq <- gsub ("\\*", "", uni.seq)
uni.seq <- AAString(uni.seq)
names_details <- names (uni.sequences[id.uniprot])
to_unip_access <- unlist(strsplit(names_details, "\\|"))
unip_access <- to_unip_access[2]
if(length(ens.seq) & length(uni.seq))
{
#print (ensembl_transcript)
#print (unip_access)
if(identical(ens.seq, uni.seq)){
hotsp.unip <- ensembl_mut_position
status <- "ok_match"
} else {
hotsp.in.unip <- align_to_unip(ens.seq, uni.seq, ensembl_mut_position)
status <- hotsp.in.unip[[1]]
hotsp.unip <- hotsp.in.unip[[2]]
}
if (status == "ok_match") {
uniprot.data <- gff.file[[unip_access]]
if(length(uniprot.data)){
type_to_exclude = c ("CHAIN", "SEQUENCE CONFLICT", "NATURAL VARIANT", "VAR_SEQ", "TOPO_DOM", "STRAND", "HELIX", "COILED", "COMPBIAS", "NON_TER", "TURN", "MUTAGENESIS", "ALTERNATIVE SEQUENCE", "MODIFIED RESIDUE")
uniprot.data$Type = toupper (uniprot.data$Type)
uniprot.data = uniprot.data [!(uniprot.data$Type %in% type_to_exclude), ]
overlaping.elements = uniprot.data[(uniprot.data$Start <= hotsp.unip) & (uniprot.data$End >= hotsp.unip),]
for (el in seq_len(nrow(overlaping.elements))){
el_type <- overlaping.elements[el, "Type"]
el_desc_all <- overlaping.elements[el, "Description"]
el_desc_spl <- unlist(strsplit(as.character(el_desc_all), ";"))
all_notes <- grep ("Note", el_desc_spl)
for(w_note in all_notes){
note <- el_desc_spl[w_note]
note <- gsub ("Note=", "", note)
el_type <- tolower (el_type)
func_elem <- paste (el_type, note, sep = ": ")
unip_func_elem <- c(unip_func_elem, func_elem)
}
}
}
}
}
unip_func_elem = unique (unip_func_elem)
if(length(unip_func_elem)){
hotspot_results[hotspot_results$Ensembl_id == ensembl_gene & hotspot_results$Prot_position == ensembl_mut_position, "Protein_funcional_region"] <- paste(unip_func_elem, collapse ="; ")
} else {
hotspot_results[hotspot_results$Ensembl_id == ensembl_gene & hotspot_results$Prot_position == ensembl_mut_position, "Protein_funcional_region"] <- "NA"
}
}
}
}
close(pb)
hotspot_results = hotspot_results[,c(1,2,4,5,14,3,7,8,9,12,13,6)]
if (write_to_file == "YES"){
message (" Saving results in the file Protein_hotspot_residues.txt.")
write.table (hotspot_results, file = "Protein_hotspot_residues.txt", sep = "\t", row.names = FALSE, col.names = colnames (hotspot_results),quote = FALSE)
}
return (hotspot_results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.