R/PrepareAnnotationEnsembl.R

Defines functions PrepareAnnotationEnsembl

Documented in PrepareAnnotationEnsembl

##' prepare the annotation from ENSEMBL through biomaRt.
##'
##' this function automaticlly prepares all  annotation infromation needed in the following analysis.
##' @title prepare annotation from ENSEMBL
##' @param mart which version of ENSEMBL dataset to use. see useMart from package biomaRt for more detail.
##' @param annotation_path specify a folder to store all the annotations
##' @param dbsnp specify a snp dataset you want to use for the SNP annotation, default is NULL.
##' @param transcript_ids optionally, only retrieve transcript annotation data for the specified set of transcript ids
##' @param splice_matrix whether generate a known exon splice matrix from the annotation. this is not necessary if you don't want to analyse junction results, default is FALSE. 
##' @param COSMIC whether to download COSMIC data, default is FALSE.
##' @param ... additional arguments
##' @return several .RData file containing annotations needed for following analysis.
##' @author Xiaojing Wang
##' @examples
##' 
##' ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl",
##' host="sep2015.archive.ensembl.org", path="/biomart/martservice", 
##' archive=FALSE)
##' 
##' annotation_path <- tempdir()
##' transcript_ids <- c("ENST00000234420", "ENST00000269305", "ENST00000445888", 
##'     "ENST00000257430", "ENST00000508376", "ENST00000288602", 
##'     "ENST00000269571", "ENST00000256078", "ENST00000384871")
##' 
##' PrepareAnnotationEnsembl(mart=ensembl, annotation_path=annotation_path, 
##'     splice_matrix=FALSE, dbsnp=NULL, transcript_ids=transcript_ids, 
##'     COSMIC=FALSE)
##' 
##' 



PrepareAnnotationEnsembl <- function(mart, annotation_path, splice_matrix=FALSE, 
                dbsnp=NULL, transcript_ids=NULL, COSMIC=FALSE,
 ...) {
    options(stringsAsFactors=FALSE)
    dataset <- biomaRt:::martDataset(mart)
    biomart <- biomaRt:::martBM(mart)
    host <- strsplit(strsplit(biomaRt:::martHost(mart), ':')[[1]][2], '//')[[1]][2]
    if (!is.null(dbsnp)) {
        session  <- browserSession()
        if(dataset == 'hsapiens_gene_ensembl') {
            if(host == 'may2009.archive.ensembl.org'){
                genome(session) <- 'hg18'
                dbsnps <- 'snp130'
            }else if(host == 'may2012.archive.ensembl.org'||host == 'dec2013.archive.ensembl.org'||host == 'feb2014.archive.ensembl.org'){
                genome(session) <- 'hg19'
                dbsnps <- trackNames(session)[grep('snp', trackNames(session), fixed=T)]
            }else{
				genome(session) <- 'hg38'
                dbsnps <- trackNames(session)[grep('snp', trackNames(session), fixed=T)]
			}
        }
        
        if(dataset == 'mmusculus_gene_ensembl') {
            if(host == 'may2009.archive.ensembl.org'||host == 'may2012.archive.ensembl.org'){
				genome(session) <- 'mm9'
                dbsnps <- 'snp128'
            }else{
				genome(session) <- 'mm10'
                dbsnps <- trackNames(session)[grep('snp', trackNames(session), fixed=T)]
            }
        }
        
        dbsnp <- pmatch(dbsnp, dbsnps)
        if (is.na(dbsnp)) 
                stop("invalid dbsnp name for specified genome")
        if (dbsnp == -1) 
                stop("ambiguous dbsnp name")
    }
    
    message("Prepare gene/transcript/protein id mapping information (ids.RData) ... ", 
            appendLF=FALSE)
    
    if(is.null(transcript_ids)){ 
        transcript_ids <- getBM(attributes=c("ensembl_transcript_id"), mart=mart)[,1]
        
    }
    attributes.id <- c("ensembl_gene_id", "hgnc_symbol", "description") 
    idstab <- getBM(attributes=attributes.id, mart=mart, 
                filters='ensembl_transcript_id', values=transcript_ids)
    colnames(idstab) <- c("ensembl_gene_id", "hgnc_symbol", "description") 
    
            idssum <- ddply(idstab, .(ensembl_gene_id), function(x) {
             new.x <- x[1, ]
             new.x$hgnc_symbol <- paste(x$hgnc_symbol, collapse=",")
             new.x
            })
    
    
    attributes.tr <- c("ensembl_gene_id", "ensembl_transcript_id",
        "ensembl_peptide_id")
    tr <- getBM(attributes=attributes.tr, mart=mart, 
                filters='ensembl_transcript_id', values=transcript_ids)
    colnames(tr) <- c("ensembl_gene_id", "ensembl_transcript_id", 
        "ensembl_peptide_id")
    ids <- merge(tr, idssum, by='ensembl_gene_id')
    description <- paste(ids[, 'hgnc_symbol'], ids[, 'description'], sep='|')
    ids <- cbind(ids[, 1:3], description)
    colnames(ids) <- c('gene_name', 'tx_name', 'pro_name', 'description')
    save(ids, file=paste(annotation_path, '/ids.RData', sep=''))
    
    packageStartupMessage(" done")
    
    message("Build TranscriptDB object (txdb.sqlite) ... ", appendLF=TRUE)
    tr_coding <- subset(ids, pro_name != "")
    tr_noncoding <- subset(ids, pro_name == "")
    
    txdb<- makeTranscriptDbFromBiomart_archive(biomart=biomart, dataset=dataset, 
            host=host, path="/biomart/martservice", archive=FALSE, 
            transcript_ids=transcript_ids)
    #saveFeatures(txdb, file=paste(annotation_path,'/txdb.sqlite',sep=''))
    saveDb(txdb, file=paste(annotation_path, '/txdb.sqlite', sep=''))
    packageStartupMessage(" done")
    #txdb_coding <- makeTranscriptDbFromBiomart_archive(biomart=biomart, 
    #   dataset=dataset, host=host, path="/biomart/martservice", archive=FALSE, 
    #   transcript_ids=tr_coding[, "tx_name"])
    #saveFeatures(txdb_coding, file=paste(annotation_path, '/txdb_coding.sqlite', sep=''))
    #saveDb(txdb_coding, file=paste(annotation_path, '/txdb_coding.sqlite', sep=''))

    #txdb_noncoding <- makeTranscriptDbFromBiomart_archive(biomart=biomart, 
    #    dataset =dataset, host=host, path="/biomart/martservice", 
    #    archive=FALSE, transcript_ids=tr_noncoding[, "tx_name"])
    #saveFeatures(txdb_noncoding, file=paste(annotation_path, '/txdb_noncoding.sqlite', sep=''))
    #saveDb(txdb_noncoding, file=paste(annotation_path, '/txdb_noncoding.sqlite', sep=''))

    transGrange <- transcripts(txdb)
    transintxdb <- IRanges::as.data.frame(transGrange)[, c('tx_id', 'tx_name')]
    
    
    message("Prepare exon annotation information (exon_anno.RData) ... ", 
            appendLF=FALSE)
    
    attributes.exon <- c("ensembl_exon_id", "ensembl_peptide_id", 'ensembl_gene_id', 
        "chromosome_name", "start_position", "end_position", "exon_chrom_start", 
        "exon_chrom_end", "strand", "5_utr_start", "5_utr_end", "3_utr_start", 
        "3_utr_end", "cds_start", "cds_end", "rank", "ensembl_transcript_id")
    exon <- getBM(attributes=attributes.exon, mart=mart, 
            filters='ensembl_transcript_id', values=transcript_ids)
    colnames(exon) <- attributes.exon
    exon <- merge(exon, transintxdb, by.x="ensembl_transcript_id", by.y="tx_name")
    
    colnames(exon) <- c("tx_name", "exon_name", "pro_name", "gene_name", 
        "chromosome_name", "start_position", "end_position", "exon_chrom_start", 
        "exon_chrom_end", "strand", "5_utr_start", "5_utr_end", "3_utr_start", 
        "3_utr_end", "cds_start", "cds_end", "rank", "tx_id")
    
    
    cdsByTx <- cdsBy(txdb, "tx", use.names=FALSE)
    cdss <-  IRanges::as.data.frame(cdsByTx)
    cds_chr_p <- data.frame(tx_id=cdss[, "group_name"],
                    cds_chr_start=cdss[, "start"], 
                    cds_chr_end=cdss[, "end"], rank=cdss[, "exon_rank"])
    
    
    cds_chr_p_coding <- subset(cds_chr_p, tx_id %in% exon[which(exon[, 'pro_name'] != ''), 'tx_id'])
    
    exon <- merge(exon, cds_chr_p_coding, by.y=c("tx_id", "rank"), 
            by.x=c("tx_id", "rank"), all.x=T)
    ###Ensembl use 1 & -1, change it to +/-
	exon[, 'strand'] <- unlist(lapply(exon[, 'strand'], function(x) 
			ifelse(x=='1', '+', '-')))
    
    save(exon,file=paste(annotation_path, '/exon_anno.RData', sep=''))
    packageStartupMessage(" done")
    
    message("Prepare protein coding sequence (procodingseq.RData)... ", 
            appendLF=FALSE)
    attributes.codingseq <- c("coding", "ensembl_peptide_id", 
            "ensembl_transcript_id") 
    if(length(tr_coding[, 'pro_name']<10000)){
        coding <- getBM(attributes=attributes.codingseq, 
                    filters="ensembl_peptide_id", values=tr_coding[, 'pro_name'], 
                    mart=mart)
    }else{ 
        index <- floor(length(tr_coding[, 'pro_name'])/10000)
        coding <- c()
        for(i in 1:index) {
            st <- (i-1)*10000+1
            ed <- i*10000
            tmp <- getBM(attributes=attributes.codingseq, filters="ensembl_peptide_id", 
                        values=tr_coding[st:ed, 'pro_name'], mart=mart)
            coding <- rbind(coding, tmp)
            #print(i)
        }
        tmp <- getBM(attributes=attributes.codingseq, filters="ensembl_peptide_id", 
                values=tr_coding[ed+1:length(tr_coding[, 'pro_name']), 'pro_name'], 
                mart=mart)
        coding <- rbind(coding, tmp)
    }
    colnames(coding) <- attributes.codingseq 
    tx_id <- transintxdb[match(coding[, 'ensembl_transcript_id'], 
                transintxdb[, 'tx_name']), 'tx_id']
    procodingseq <- cbind(coding, tx_id)
    colnames(procodingseq) <- c("coding", "pro_name", "tx_name", "tx_id")
    save(procodingseq,file=paste(annotation_path, '/procodingseq.RData', sep=''))
    packageStartupMessage(" done")
    
    message("Prepare protein sequence (proseq.RData) ... ", appendLF=FALSE)
    attributes.proseq <- c("peptide", "ensembl_peptide_id", "ensembl_transcript_id") 
    if(length(tr_coding[, 'pro_name']<10000)){
        proteinseq <- getBM(attributes=attributes.proseq, 
            filters="ensembl_peptide_id", values=tr_coding[,'pro_name'], mart=mart)
    }else{ 
        index <- floor(length(tr_coding[, 'pro_name'])/10000)
        proteinseq <- c()
        for(i in 1:index) {
            st <- (i-1)*10000+1
            ed <- i*10000
            tmp <- getBM(attributes=attributes.proseq, filters="ensembl_peptide_id", 
                    values= tr_coding[st:ed, 'pro_name'], mart=mart)
            proteinseq <- rbind(proteinseq, tmp)
            #print(i)
        }
        tmp <- getBM(attributes=attributes.proseq, filters="ensembl_peptide_id", 
            values=tr_coding[ed+1:length(tr_coding[, 'pro_name']), 'pro_name'], 
            mart=mart)
        proteinseq <- rbind(proteinseq, tmp)
    }
    colnames(proteinseq) <- c("peptide", "pro_name", "tx_name")
    save(proteinseq, file=paste(annotation_path, '/proteinseq.RData', sep=''))
    packageStartupMessage(" done")
    
    
    if (!is.null(dbsnp)) {
        
        message("Prepare dbSNP information (dbsnpinCoding.RData) ... ", 
            appendLF=FALSE)
        
        if(length(dbsnps) == 1&&dbsnps == 'snp128'){
            dbsnp_query <- ucscTableQuery(session, dbsnps[dbsnp], table='snp128')
        }else{
            dbsnp_query <- ucscTableQuery(session, dbsnps[dbsnp], 
                    table=paste(dbsnps[dbsnp], 'CodingDbSnp', sep=''))
        }
        snpCodingTab <- getTable(dbsnp_query)
        snpCodingTab[, 'chrom'] <- gsub('chr', '', snpCodingTab[, 'chrom'])
        chrlist <- paste(c(seq(1:22),'X','Y'))
        snpCoding <- subset(snpCodingTab, chrom %in% chrlist ,select=c(chrom:name, alleleCount, alleles))
        snpCoding <- unique(snpCoding)
        #snpCoding[, 'chrom'] <- gsub('chrM', 'MT', snpCoding[, 'chrom'])
        #
        
        #save(snpCoding,file=paste(annotation_path,'/snpcoding.RData',sep=''))
        snpCoding <- GRanges(seqnames=snpCoding[, 'chrom'], 
                    ranges=IRanges(start=snpCoding[, 'chromStart'], 
                    end=snpCoding[, 'chromEnd']), strand='*', 
                    rsid=snpCoding[, 'name'], alleleCount=snpCoding[, 'alleleCount'], 
                    alleles=snpCoding[, 'alleles'])
        
        #seqlevels(snpCoding)
        
        #if(TRUE%in% grep('chr',seqlevels(snpCoding)) > 0 ) {
        #    rchar <- sub('chr','',seqlevels(snpCoding))
        #    names(rchar) <- seqlevels(snpCoding)
        #    snpCoding <- renameSeqlevels(snpCoding, rchar) }
        #if('M'%in%seqlevels(snpCoding)) snpCoding <- renameSeqlevels(snpCoding, c( M='MT'))
        #chrlist <- paste(c(seq(1:22),'X','Y'))
        transGrange_snp <- transGrange
        #transGrange_snp <- keepSeqlevels(transGrange_snp, snpCoding)
        #snpCoding <- keepSeqlevels(snpCoding, transGrange_snp)
        
        #snpCoding <- keepSeqlevels(snpCoding, transGrange)
        
        dbsnpinCoding <- subsetByOverlaps(snpCoding,transGrange_snp)
        
        save(dbsnpinCoding,file=paste(annotation_path, '/dbsnpinCoding.RData', sep=''))
        packageStartupMessage(" done")
    
    }
    
    if (COSMIC) {
        message("Prepare COSMIC information (cosmic.RData) ... ", 
                appendLF=FALSE)
    
        cosmic_query <- ucscTableQuery(session, 'cosmic', table='cosmic')
        cosmicTab <- getTable(cosmic_query)
        cosmicTab[,'chrom'] <- gsub('chrM', 'MT', cosmicTab[, 'chrom'])
        cosmicTab[,'chrom'] <- gsub('chr', '', cosmicTab[, 'chrom']) 
        chrlist <- paste(c(seq(1:22),'X','Y','MT')) 
        cosmicTab <- subset(cosmicTab, chrom %in% chrlist)
        cosmic <- GRanges(seqnames=cosmicTab[, 'chrom'], 
                ranges=IRanges(start=cosmicTab[, 'chromStart'], 
                end=cosmicTab[, 'chromEnd']), strand = '*', 
                cosid=cosmicTab[, 'name'])   
        
        transGrange_cosmic <- transGrange 
        #transGrange_cosmic <- keepSeqlevels(transGrange_cosmic, cosmic)        
        #cosmic <- keepSeqlevels(cosmic, transGrange_cosmic)
        cosmic <- subsetByOverlaps(cosmic, transGrange_cosmic)
        
        save(cosmic,file=paste(annotation_path, '/cosmic.RData', sep=''))    
        packageStartupMessage(" done")        
    }
    
    
    if(splice_matrix){
        message("Prepare exon splice information (splicemax.RData) ... ", 
                appendLF=FALSE)
        exonByTx <- exonsBy(txdb, "tx", use.names=F)
        index <- which(elementNROWS(exonByTx)==1)
        exonByTx_mul <- exonByTx[-index]
        exons_mul <- IRanges::as.data.frame(exonByTx_mul)
        exonslist <- split(exons_mul, exons_mul$group)
       
        splicemax_list <- lapply(exonslist, .gen_splicmatrix)
        splicemax <- do.call(rbind, splicemax_list)
        save(splicemax, file=paste(annotation_path, '/splicemax.RData', sep=''))
        packageStartupMessage(" done")    
    }
    #splice <- paste(splicemax[, 1], splicemax[, 2], sep='-')
    
}

###########################################################################################
#generate splicing matrix
###########################################################
.gen_splicmatrix <- function(x, 
     ...) {           
         mystrand=x[1,'strand']
         a=x[,'exon_rank']
         b=x[,'exon_id']
         n=length(a)
         if (mystrand=='+'){
            tmp=order(a)
            
        }else{
            tmp=order(a,decreasing=T)
            
        }
        mm=cbind(b[tmp[1:(n-1)]], b[tmp[2:n]])
        mm
    }

Try the customProDB package in your browser

Any scripts or data that you put into this service are public.

customProDB documentation built on Nov. 8, 2020, 8:06 p.m.