R/map_to_func_elem.R

Defines functions map_to_func_elem

Documented in map_to_func_elem

map_to_func_elem <- function(hotspot_results, write_to_file = "NO", ens_release = "109"){
    ## Checking for argument requirements
    if(!(write_to_file %in% c("YES", "NO"))){
        stop("write_to_file should either be YES or NO")
    }
    ###
    message ("  Obtaining sequences of proteins encoded by the representative Ensembl transcripts.")
    
    if(ens_release == 109) {
        ens_database <- "https://feb2023.archive.ensembl.org"
    } else {
        ens_database <- ens_release
    }
    
    #####
    # to fail gracefully if Ensembl is down
    # code from http://bioconductor.org/developers/how-to/web-query/
    N.TRIES <- as.integer(5)
    stopifnot(length(N.TRIES) == 1L, !is.na(N.TRIES))
    
    while (N.TRIES > 0L) {
        ensembl <- tryCatch(useMart(biomart="ENSEMBL_MART_ENSEMBL", host = ens_database, dataset = "hsapiens_gene_ensembl"), error=identity)
        if (!inherits(ensembl, "error"))
            break
        N.TRIES <- N.TRIES - 1L
    }
    
    if (N.TRIES == 0L) {
        stop("'useMart()' failed:",
             "\n  error: ", conditionMessage(ensembl))
    }
    #####
    # ensembl = useMart(biomart="ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl") # --- for current version
    
    transc_ids = hotspot_results$Repres_tr
    prot_seq = getSequence (id = transc_ids, type= "ensembl_transcript_id", seqType = "peptide", mart = ensembl)
    
    gff.file <- list()
    uni.sequences <- c()
    
    message ("  Obtaining UniProtKB sequences and coordinates of protein functional elements.")
    genes_w_hotspots <- unique(hotspot_results$Ensembl_id)
    for(gene_ens in genes_w_hotspots){
        gene_unip <- hotspot_results [hotspot_results$Ensembl_id==gene_ens, "Assoc_unip_ids"]
        assoc_unip_ids <- as.character(gene_unip)

        if(length(grep(",", assoc_unip_ids))){
            assoc_unip_ids <- trimws(unlist(strsplit(assoc_unip_ids, ",")))
        }
            
        for (unip_acc in assoc_unip_ids){
            if (!(is.na(unip_acc)))
            {
            if(nchar(unip_acc) == 6){
                retrieve_unip <- paste("http://www.uniprot.org/uniprot/", unip_acc, ".gff", sep = "")
                
                #####
                # to fail gracefully if Uniprot is not accessible
                # code from http://bioconductor.org/developers/how-to/web-query/
                N.TRIES <- as.integer(5)
                stopifnot(length(N.TRIES) == 1L, !is.na(N.TRIES))
                
                while (N.TRIES > 0L) {
                    result <- tryCatch(read.delim(retrieve_unip, skip = 2, header = FALSE), error=identity)
                    if (!inherits(result, "error"))
                        break
                    N.TRIES <- N.TRIES - 1L
                }
                
                if (N.TRIES == 0L) {
                    stop("Retrieving sequence from Uniprot failed:\n",
                         retrieve_unip,
                         "\n  error: ", conditionMessage(result))
                }
                #####
                gff.file[[unip_acc]] <- result
                    
                retrieve_unip <- paste ("http://www.uniprot.org/uniprot/", unip_acc, ".fasta", sep = "")
                
                #####
                # to fail gracefully if Uniprot is not accessible
                # code from http://bioconductor.org/developers/how-to/web-query/
                N.TRIES <- as.integer(5)
                stopifnot(length(N.TRIES) == 1L, !is.na(N.TRIES))
                
                while (N.TRIES > 0L) {
                    result <- tryCatch(readAAStringSet(retrieve_unip), error=identity)
                    if (!inherits(result, "error"))
                        break
                    N.TRIES <- N.TRIES - 1L
                }
                
                if (N.TRIES == 0L) {
                    stop("Reading sequence from Uniprot failed:\n",
                         , retrieve_unip,
                         "\n  error: ", conditionMessage(result))
                }
                #####
                
                uni.sequences <- c(uni.sequences, result)
                    
            }else {
                warning("IDs for following uniprot accessions could not be retrieved:", unip_acc, " that corresponds to:", gene_ens, ".")
            }
            }
        }
    }
    
    gff.file <- lapply(gff.file, function(x)x[,c(1,2,3,4,5,9)])
    gff.file <- lapply(gff.file, function(x){
        colnames(x) <- c("ID", "Source", "Type", "Start", "End", "Description"); x
    })
    
    uni.sequences <- do.call(c, uni.sequences)
    
    message ("  Assessing sequence agreement and mapping hotspots to functional elements.")
    pb <- txtProgressBar(min = 0, max = nrow(hotspot_results))

    for(i in seq_len(nrow(hotspot_results))){
        setTxtProgressBar(pb, i)
        #print(i)
        uni_genes <- hotspot_results[i, "Assoc_unip_ids"]
        ensembl_gene <- hotspot_results[i, "Ensembl_id"]
        ensembl_transcript <- hotspot_results[i, "Repres_tr"]
        ensembl_mut_position <- hotspot_results[i, "Prot_position"]
        ens.seq <- prot_seq[prot_seq$ensembl_transcript_id==ensembl_transcript, "peptide"]
        ids.uniprot <- c()
        all_unip <- unlist(strsplit(uni_genes, ","))
        for (uni_gene in all_unip)
        {
            id.unip <- grep(uni_gene, names(uni.sequences))
            ids.uniprot <- c(ids.uniprot, id.unip)
        }
        #### nema if, samp for loop za sve uniprot ids i posli uniq od overlapa
        
        unip_func_elem <- c()
        
        if((is.na(ens.seq)) || (ens.seq == "Sequence unavailable")) {
            warning ("No ensembl sequence available for ensembl_gene: ", ensembl_gene, " where the representative transcript is ", ensembl_transcript)
        } else if (is.na (ids.uniprot[1])) {
            # warning ("No UniProtKB identifier available for ensembl_gene: ", ensembl_gene)
        }
        else {
            ens.seq <- gsub("\\*", "", ens.seq)
            ens.seq <- AAString(ens.seq)
            
            for (id.uniprot in ids.uniprot) {
                uni.seq <- as.character(uni.sequences[[id.uniprot]])
                uni.seq <- gsub ("\\*", "", uni.seq)
                uni.seq <- AAString(uni.seq)
                
                names_details <- names (uni.sequences[id.uniprot])
                to_unip_access <- unlist(strsplit(names_details, "\\|"))
                unip_access <- to_unip_access[2]
                
                if(length(ens.seq) & length(uni.seq))
                {
                    #print (ensembl_transcript)
                    #print (unip_access)
                    
                    if(identical(ens.seq, uni.seq)){
                        hotsp.unip <- ensembl_mut_position
                        status <- "ok_match"
                    } else {
                        hotsp.in.unip <- align_to_unip(ens.seq, uni.seq, ensembl_mut_position)
                        status <- hotsp.in.unip[[1]]
                        hotsp.unip <- hotsp.in.unip[[2]]
                    }
                    
                    if (status == "ok_match") {
                        uniprot.data <- gff.file[[unip_access]]
                        if(length(uniprot.data)){
                            type_to_exclude = c ("CHAIN", "SEQUENCE CONFLICT", "NATURAL VARIANT", "VAR_SEQ", "TOPO_DOM", "STRAND", "HELIX", "COILED", "COMPBIAS", "NON_TER", "TURN", "MUTAGENESIS", "ALTERNATIVE SEQUENCE", "MODIFIED RESIDUE")
                            uniprot.data$Type = toupper (uniprot.data$Type)
                            uniprot.data = uniprot.data [!(uniprot.data$Type %in% type_to_exclude), ]
                            overlaping.elements = uniprot.data[(uniprot.data$Start <= hotsp.unip) & (uniprot.data$End >= hotsp.unip),]
                        
                            for (el in seq_len(nrow(overlaping.elements))){
                                el_type <- overlaping.elements[el, "Type"]
                                el_desc_all <- overlaping.elements[el, "Description"]
                                el_desc_spl <- unlist(strsplit(as.character(el_desc_all), ";"))
                                all_notes <- grep ("Note", el_desc_spl)
                                for(w_note in all_notes){
                                    note <- el_desc_spl[w_note]
                                    note <- gsub ("Note=", "", note)
                                    el_type <- tolower (el_type)
                                    func_elem <- paste (el_type, note, sep = ": ")
                                    unip_func_elem <- c(unip_func_elem, func_elem)
                                }
                            }
                        }
                    }
                }
                unip_func_elem = unique (unip_func_elem)
                if(length(unip_func_elem)){
                    hotspot_results[hotspot_results$Ensembl_id == ensembl_gene & hotspot_results$Prot_position == ensembl_mut_position, "Protein_funcional_region"] <- paste(unip_func_elem, collapse ="; ")
                } else {
                    hotspot_results[hotspot_results$Ensembl_id == ensembl_gene & hotspot_results$Prot_position == ensembl_mut_position, "Protein_funcional_region"] <- "NA"
                }
            }
        }
    }
    close(pb)
    
    hotspot_results = hotspot_results[,c(1,2,4,5,14,3,7,8,9,12,13,6)]
    if (write_to_file == "YES"){
      message ("   Saving results in the file Protein_hotspot_residues.txt.")
      write.table (hotspot_results, file = "Protein_hotspot_residues.txt", sep = "\t", row.names = FALSE, col.names = colnames (hotspot_results),quote = FALSE)
    }
    return (hotspot_results)
}
peterblattmann/DominoEffect documentation built on Nov. 9, 2023, 2:39 a.m.