R/PrepareAnnotationEnsembl.R
In customProDB: Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

Documented in PrepareAnnotationEnsembl

##' prepare the annotation from ENSEMBL through biomaRt.
##'
##' this function automaticlly prepares all  annotation infromation needed in the following analysis.
##' @title prepare annotation from ENSEMBL
##' @param mart which version of ENSEMBL dataset to use. see useMart from package biomaRt for more detail.
##' @param annotation_path specify a folder to store all the annotations
##' @param dbsnp specify a snp dataset you want to use for the SNP annotation, default is NULL.
##' @param transcript_ids optionally, only retrieve transcript annotation data for the specified set of transcript ids
##' @param splice_matrix whether generate a known exon splice matrix from the annotation. this is not necessary if you don't want to analyse junction results, default is FALSE. 
##' @param COSMIC whether to download COSMIC data, default is FALSE.
##' @param ... additional arguments
##' @return several .RData file containing annotations needed for following analysis.
##' @author Xiaojing Wang
##' @examples
##' 
##' ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl",
##' host="sep2015.archive.ensembl.org", path="/biomart/martservice", 
##' archive=FALSE)
##' 
##' annotation_path <- tempdir()
##' transcript_ids <- c("ENST00000234420", "ENST00000269305", "ENST00000445888", 
##'     "ENST00000257430", "ENST00000508376", "ENST00000288602", 
##'     "ENST00000269571", "ENST00000256078", "ENST00000384871")
##' 
##' PrepareAnnotationEnsembl(mart=ensembl, annotation_path=annotation_path, 
##'     splice_matrix=FALSE, dbsnp=NULL, transcript_ids=transcript_ids, 
##'     COSMIC=FALSE)
##' 
##' 



PrepareAnnotationEnsembl <- function(mart, annotation_path, splice_matrix=FALSE, 
                dbsnp=NULL, transcript_ids=NULL, COSMIC=FALSE,
 ...) {
    options(stringsAsFactors=FALSE)
    dataset <- biomaRt:::martDataset(mart)
    biomart <- biomaRt:::martBM(mart)
    host <- strsplit(strsplit(biomaRt:::martHost(mart), ':')[[1]][2], '//')[[1]][2]
    if (!is.null(dbsnp)) {
        session  <- browserSession()
        if(dataset == 'hsapiens_gene_ensembl') {
            if(host == 'may2009.archive.ensembl.org'){
                genome(session) <- 'hg18'
                dbsnps <- 'snp130'
            }else if(host == 'may2012.archive.ensembl.org'||host == 'dec2013.archive.ensembl.org'||host == 'feb2014.archive.ensembl.org'){
                genome(session) <- 'hg19'
                dbsnps <- trackNames(session)[grep('snp', trackNames(session), fixed=T)]
            }else{
				genome(session) <- 'hg38'
                dbsnps <- trackNames(session)[grep('snp', trackNames(session), fixed=T)]
			}
        }
        
        if(dataset == 'mmusculus_gene_ensembl') {
            if(host == 'may2009.archive.ensembl.org'||host == 'may2012.archive.ensembl.org'){
				genome(session) <- 'mm9'
                dbsnps <- 'snp128'
            }else{
				genome(session) <- 'mm10'
                dbsnps <- trackNames(session)[grep('snp', trackNames(session), fixed=T)]
            }
        }
        
        dbsnp <- pmatch(dbsnp, dbsnps)
        if (is.na(dbsnp)) 
                stop("invalid dbsnp name for specified genome")
        if (dbsnp == -1) 
                stop("ambiguous dbsnp name")
    }
    
    message("Prepare gene/transcript/protein id mapping information (ids.RData) ... ", 
            appendLF=FALSE)
    
    if(is.null(transcript_ids)){ 
        transcript_ids <- getBM(attributes=c("ensembl_transcript_id"), mart=mart)[,1]
        
    }
    attributes.id <- c("ensembl_gene_id", "hgnc_symbol", "description") 
    idstab <- getBM(attributes=attributes.id, mart=mart, 
                filters='ensembl_transcript_id', values=transcript_ids)
    colnames(idstab) <- c("ensembl_gene_id", "hgnc_symbol", "description") 
    
            idssum <- ddply(idstab, .(ensembl_gene_id), function(x) {
             new.x <- x[1, ]
             new.x$hgnc_symbol <- paste(x$hgnc_symbol, collapse=",")
             new.x
            })
    
    
    attributes.tr <- c("ensembl_gene_id", "ensembl_transcript_id",
        "ensembl_peptide_id")
    tr <- getBM(attributes=attributes.tr, mart=mart, 
                filters='ensembl_transcript_id', values=transcript_ids)
    colnames(tr) <- c("ensembl_gene_id", "ensembl_transcript_id", 
        "ensembl_peptide_id")
    ids <- merge(tr, idssum, by='ensembl_gene_id')
    description <- paste(ids[, 'hgnc_symbol'], ids[, 'description'], sep='|')
    ids <- cbind(ids[, 1:3], description)
    colnames(ids) <- c('gene_name', 'tx_name', 'pro_name', 'description')
    save(ids, file=paste(annotation_path, '/ids.RData', sep=''))
    
    packageStartupMessage(" done")
    
    message("Build TranscriptDB object (txdb.sqlite) ... ", appendLF=TRUE)
    tr_coding <- subset(ids, pro_name != "")
    tr_noncoding <- subset(ids, pro_name == "")
    
    txdb<- makeTranscriptDbFromBiomart_archive(biomart=biomart, dataset=dataset, 
            host=host, path="/biomart/martservice", archive=FALSE, 
            transcript_ids=transcript_ids)
    #saveFeatures(txdb, file=paste(annotation_path,'/txdb.sqlite',sep=''))
    saveDb(txdb, file=paste(annotation_path, '/txdb.sqlite', sep=''))
    packageStartupMessage(" done")
    #txdb_coding <- makeTranscriptDbFromBiomart_archive(biomart=biomart, 
    #   dataset=dataset, host=host, path="/biomart/martservice", archive=FALSE, 
    #   transcript_ids=tr_coding[, "tx_name"])
    #saveFeatures(txdb_coding, file=paste(annotation_path, '/txdb_coding.sqlite', sep=''))
    #saveDb(txdb_coding, file=paste(annotation_path, '/txdb_coding.sqlite', sep=''))

    #txdb_noncoding <- makeTranscriptDbFromBiomart_archive(biomart=biomart, 
    #    dataset =dataset, host=host, path="/biomart/martservice", 
    #    archive=FALSE, transcript_ids=tr_noncoding[, "tx_name"])
    #saveFeatures(txdb_noncoding, file=paste(annotation_path, '/txdb_noncoding.sqlite', sep=''))
    #saveDb(txdb_noncoding, file=paste(annotation_path, '/txdb_noncoding.sqlite', sep=''))

    transGrange <- transcripts(txdb)
    transintxdb <- IRanges::as.data.frame(transGrange)[, c('tx_id', 'tx_name')]
    
    
    message("Prepare exon annotation information (exon_anno.RData) ... ", 
            appendLF=FALSE)
    
    attributes.exon <- c("ensembl_exon_id", "ensembl_peptide_id", 'ensembl_gene_id', 
        "chromosome_name", "start_position", "end_position", "exon_chrom_start", 
        "exon_chrom_end", "strand", "5_utr_start", "5_utr_end", "3_utr_start", 
        "3_utr_end", "cds_start", "cds_end", "rank", "ensembl_transcript_id")
    exon <- getBM(attributes=attributes.exon, mart=mart, 
            filters='ensembl_transcript_id', values=transcript_ids)
    colnames(exon) <- attributes.exon
    exon <- merge(exon, transintxdb, by.x="ensembl_transcript_id", by.y="tx_name")
    
    colnames(exon) <- c("tx_name", "exon_name", "pro_name", "gene_name", 
        "chromosome_name", "start_position", "end_position", "exon_chrom_start", 
        "exon_chrom_end", "strand", "5_utr_start", "5_utr_end", "3_utr_start", 
        "3_utr_end", "cds_start", "cds_end", "rank", "tx_id")
    
    
    cdsByTx <- cdsBy(txdb, "tx", use.names=FALSE)
    cdss <-  IRanges::as.data.frame(cdsByTx)
    cds_chr_p <- data.frame(tx_id=cdss[, "group_name"],
                    cds_chr_start=cdss[, "start"], 
                    cds_chr_end=cdss[, "end"], rank=cdss[, "exon_rank"])
    
    
    cds_chr_p_coding <- subset(cds_chr_p, tx_id %in% exon[which(exon[, 'pro_name'] != ''), 'tx_id'])
    
    exon <- merge(exon, cds_chr_p_coding, by.y=c("tx_id", "rank"), 
            by.x=c("tx_id", "rank"), all.x=T)
    ###Ensembl use 1 & -1, change it to +/-
	exon[, 'strand'] <- unlist(lapply(exon[, 'strand'], function(x) 
			ifelse(x=='1', '+', '-')))
    
    save(exon,file=paste(annotation_path, '/exon_anno.RData', sep=''))
    packageStartupMessage(" done")
    
    message("Prepare protein coding sequence (procodingseq.RData)... ", 
            appendLF=FALSE)
    attributes.codingseq <- c("coding", "ensembl_peptide_id", 
            "ensembl_transcript_id") 
    if(length(tr_coding[, 'pro_name']<10000)){
        coding <- getBM(attributes=attributes.codingseq, 
                    filters="ensembl_peptide_id", values=tr_coding[, 'pro_name'], 
                    mart=mart)
    }else{ 
        index <- floor(length(tr_coding[, 'pro_name'])/10000)
        coding <- c()
        for(i in 1:index) {
            st <- (i-1)*10000+1
            ed <- i*10000
            tmp <- getBM(attributes=attributes.codingseq, filters="ensembl_peptide_id", 
                        values=tr_coding[st:ed, 'pro_name'], mart=mart)
            coding <- rbind(coding, tmp)
            #print(i)
        }
        tmp <- getBM(attributes=attributes.codingseq, filters="ensembl_peptide_id", 
                values=tr_coding[ed+1:length(tr_coding[, 'pro_name']), 'pro_name'], 
                mart=mart)
        coding <- rbind(coding, tmp)
    }
    colnames(coding) <- attributes.codingseq 
    tx_id <- transintxdb[match(coding[, 'ensembl_transcript_id'], 
                transintxdb[, 'tx_name']), 'tx_id']
    procodingseq <- cbind(coding, tx_id)
    colnames(procodingseq) <- c("coding", "pro_name", "tx_name", "tx_id")
    save(procodingseq,file=paste(annotation_path, '/procodingseq.RData', sep=''))
    packageStartupMessage(" done")
    
    message("Prepare protein sequence (proseq.RData) ... ", appendLF=FALSE)
    attributes.proseq <- c("peptide", "ensembl_peptide_id", "ensembl_transcript_id") 
    if(length(tr_coding[, 'pro_name']<10000)){
        proteinseq <- getBM(attributes=attributes.proseq, 
            filters="ensembl_peptide_id", values=tr_coding[,'pro_name'], mart=mart)
    }else{ 
        index <- floor(length(tr_coding[, 'pro_name'])/10000)
        proteinseq <- c()
        for(i in 1:index) {
            st <- (i-1)*10000+1
            ed <- i*10000
            tmp <- getBM(attributes=attributes.proseq, filters="ensembl_peptide_id", 
                    values= tr_coding[st:ed, 'pro_name'], mart=mart)
            proteinseq <- rbind(proteinseq, tmp)
            #print(i)
        }
        tmp <- getBM(attributes=attributes.proseq, filters="ensembl_peptide_id", 
            values=tr_coding[ed+1:length(tr_coding[, 'pro_name']), 'pro_name'], 
            mart=mart)
        proteinseq <- rbind(proteinseq, tmp)
    }
    colnames(proteinseq) <- c("peptide", "pro_name", "tx_name")
    save(proteinseq, file=paste(annotation_path, '/proteinseq.RData', sep=''))
    packageStartupMessage(" done")
    
    
    if (!is.null(dbsnp)) {
        
        message("Prepare dbSNP information (dbsnpinCoding.RData) ... ", 
            appendLF=FALSE)
        
        if(length(dbsnps) == 1&&dbsnps == 'snp128'){
            dbsnp_query <- ucscTableQuery(session, dbsnps[dbsnp], table='snp128')
        }else{
            dbsnp_query <- ucscTableQuery(session, dbsnps[dbsnp], 
                    table=paste(dbsnps[dbsnp], 'CodingDbSnp', sep=''))
        }
        snpCodingTab <- getTable(dbsnp_query)
        snpCodingTab[, 'chrom'] <- gsub('chr', '', snpCodingTab[, 'chrom'])
        chrlist <- paste(c(seq(1:22),'X','Y'))
        snpCoding <- subset(snpCodingTab, chrom %in% chrlist ,select=c(chrom:name, alleleCount, alleles))
        snpCoding <- unique(snpCoding)
        #snpCoding[, 'chrom'] <- gsub('chrM', 'MT', snpCoding[, 'chrom'])
        #
        
        #save(snpCoding,file=paste(annotation_path,'/snpcoding.RData',sep=''))
        snpCoding <- GRanges(seqnames=snpCoding[, 'chrom'], 
                    ranges=IRanges(start=snpCoding[, 'chromStart'], 
                    end=snpCoding[, 'chromEnd']), strand='*', 
                    rsid=snpCoding[, 'name'], alleleCount=snpCoding[, 'alleleCount'], 
                    alleles=snpCoding[, 'alleles'])
        
        #seqlevels(snpCoding)
        
        #if(TRUE%in% grep('chr',seqlevels(snpCoding)) > 0 ) {
        #    rchar <- sub('chr','',seqlevels(snpCoding))
        #    names(rchar) <- seqlevels(snpCoding)
        #    snpCoding <- renameSeqlevels(snpCoding, rchar) }
        #if('M'%in%seqlevels(snpCoding)) snpCoding <- renameSeqlevels(snpCoding, c( M='MT'))
        #chrlist <- paste(c(seq(1:22),'X','Y'))
        transGrange_snp <- transGrange
        #transGrange_snp <- keepSeqlevels(transGrange_snp, snpCoding)
        #snpCoding <- keepSeqlevels(snpCoding, transGrange_snp)
        
        #snpCoding <- keepSeqlevels(snpCoding, transGrange)
        
        dbsnpinCoding <- subsetByOverlaps(snpCoding,transGrange_snp)
        
        save(dbsnpinCoding,file=paste(annotation_path, '/dbsnpinCoding.RData', sep=''))
        packageStartupMessage(" done")
    
    }
    
    if (COSMIC) {
        message("Prepare COSMIC information (cosmic.RData) ... ", 
                appendLF=FALSE)
    
        cosmic_query <- ucscTableQuery(session, 'cosmic', table='cosmic')
        cosmicTab <- getTable(cosmic_query)
        cosmicTab[,'chrom'] <- gsub('chrM', 'MT', cosmicTab[, 'chrom'])
        cosmicTab[,'chrom'] <- gsub('chr', '', cosmicTab[, 'chrom']) 
        chrlist <- paste(c(seq(1:22),'X','Y','MT')) 
        cosmicTab <- subset(cosmicTab, chrom %in% chrlist)
        cosmic <- GRanges(seqnames=cosmicTab[, 'chrom'], 
                ranges=IRanges(start=cosmicTab[, 'chromStart'], 
                end=cosmicTab[, 'chromEnd']), strand = '*', 
                cosid=cosmicTab[, 'name'])   
        
        transGrange_cosmic <- transGrange 
        #transGrange_cosmic <- keepSeqlevels(transGrange_cosmic, cosmic)        
        #cosmic <- keepSeqlevels(cosmic, transGrange_cosmic)
        cosmic <- subsetByOverlaps(cosmic, transGrange_cosmic)
        
        save(cosmic,file=paste(annotation_path, '/cosmic.RData', sep=''))    
        packageStartupMessage(" done")        
    }
    
    
    if(splice_matrix){
        message("Prepare exon splice information (splicemax.RData) ... ", 
                appendLF=FALSE)
        exonByTx <- exonsBy(txdb, "tx", use.names=F)
        index <- which(elementNROWS(exonByTx)==1)
        exonByTx_mul <- exonByTx[-index]
        exons_mul <- IRanges::as.data.frame(exonByTx_mul)
        exonslist <- split(exons_mul, exons_mul$group)
       
        splicemax_list <- lapply(exonslist, .gen_splicmatrix)
        splicemax <- do.call(rbind, splicemax_list)
        save(splicemax, file=paste(annotation_path, '/splicemax.RData', sep=''))
        packageStartupMessage(" done")    
    }
    #splice <- paste(splicemax[, 1], splicemax[, 2], sep='-')
    
}

###########################################################################################
#generate splicing matrix
###########################################################
.gen_splicmatrix <- function(x, 
     ...) {           
         mystrand=x[1,'strand']
         a=x[,'exon_rank']
         b=x[,'exon_id']
         n=length(a)
         if (mystrand=='+'){
            tmp=order(a)
            
        }else{
            tmp=order(a,decreasing=T)
            
        }
        mm=cbind(b[tmp[1:(n-1)]], b[tmp[2:n]])
        mm
    }

Any scripts or data that you put into this service are public.

customProDB documentation built on Nov. 8, 2020, 8:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

customProDB
Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

R/PrepareAnnotationEnsembl.R
In customProDB: Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

Defines functions PrepareAnnotationEnsembl

Documented in PrepareAnnotationEnsembl

Try the customProDB package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

customProDB Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

R/PrepareAnnotationEnsembl.R In customProDB: Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

Defines functions PrepareAnnotationEnsembl

Documented in PrepareAnnotationEnsembl

Try the customProDB package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

customProDB
Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search

R/PrepareAnnotationEnsembl.R
In customProDB: Generate customized protein database from NGS data, with a focus on RNA-Seq data, for proteomics search