R/metaseqr.annotation.R

Defines functions correct.transcripts get.transcript.attributes get.transcript.utr.attributes get.exon.attributes get.gene.attributes get.valid.chrs get.dataset get.alt.host get.host get.biotypes load.bs.genome get.bs.organism get.ucsc.organism get.gc.content get.ucsc.annotation get.ensembl.annotation get.annotation

Documented in get.annotation get.biotypes get.bs.organism get.dataset get.ensembl.annotation get.exon.attributes get.gc.content get.gene.attributes get.host get.transcript.utr.attributes get.ucsc.annotation get.ucsc.organism get.valid.chrs load.bs.genome

#' Annotation downloader
#'
#' This function connects to the EBI's Biomart service using the package biomaRt
#' and downloads annotation elements (gene co-ordinates, exon co-ordinates, gene
#' identifications, biotypes etc.) for each of the supported organisms. See the
#' help page of \code{\link{metaseqr}} for a list of supported organisms. The
#' function downloads annotation for an organism genes or exons.
#'
#' @param org the organism for which to download annotation.
#' @param type either \code{"gene"} or \code{"exon"}.
#' @param refdb the online source to use to fetch annotation. It can be
#' \code{"ensembl"} (default), \code{"ucsc"} or \code{"refseq"}. In the later two
#' cases, an SQL connection is opened with the UCSC public databases.
#' @param multic a logical value indicating the presence of multiple cores. Defaults
#' to \code{FALSE}. Do not change it if you are not sure whether package parallel
#' has been loaded or not. It is used in the case of \code{type="exon"} to process
#' the return value of the query to the UCSC Genome Browser database.
#' @return A data frame with the canonical (not isoforms!) genes or exons of the
#' requested organism. When \code{type="genes"}, the data frame has the following
#' columns: chromosome, start, end, gene_id, gc_content, strand, gene_name, biotype.
#' When \code{type="exon"} the data frame has the following columns: chromosome,
#' start, end, exon_id, gene_id, strand, gene_name, biotype. The gene_id and exon_id
#' correspond to Ensembl gene and exon accessions respectively. The gene_name
#' corresponds to HUGO nomenclature gene names.
#' @note The data frame that is returned contains only "canonical" chromosomes
#' for each organism. It does not contain haplotypes or random locations and does
#' not contain chromosome M.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' hg19.genes <- get.annotation("hg19","gene","ensembl")
#' mm9.exons <- get.annotation("mm9","exon","ucsc")
#'}
get.annotation <- function(org,type,refdb="ensembl",multic=FALSE) {
    org <- tolower(org)
    if (type %in% c("utr","transcript") && refdb %in% c("ucsc","refseq")) {
        disp("Quant-Seq (utr) and transcript analysis is not yet supported ",
            "with UCSC or RefSeq annotation. Switching to Ensembl...")
        refdb <- "ensembl"
    }
    switch(refdb,
        ensembl = { return(get.ensembl.annotation(org,type)) },
        ucsc = { return(get.ucsc.annotation(org,type,refdb,multic)) },
        refseq = { return(get.ucsc.annotation(org,type,refdb,multic)) }
    )
}

#' Ensembl annotation downloader
#'
#' This function connects to the EBI's Biomart service using the package biomaRt
#' and downloads annotation elements (gene co-ordinates, exon co-ordinates, gene
#' identifications, biotypes etc.) for each of the supported organisms. See the
#' help page of \code{\link{metaseqr}} for a list of supported organisms. The
#' function downloads annotation for an organism genes or exons.
#'
#' @param org the organism for which to download annotation.
#' @param type either \code{"gene"} or \code{"exon"}.
#' @return A data frame with the canonical (not isoforms!) genes or exons of the
#' requested organism. When \code{type="genes"}, the data frame has the following
#' columns: chromosome, start, end, gene_id, gc_content, strand, gene_name, biotype.
#' When \code{type="exon"} the data frame has the following columns: chromosome,
#' start, end, exon_id, gene_id, strand, gene_name, biotype. The gene_id and exon_id
#' correspond to Ensembl gene and exon accessions respectively. The gene_name
#' corresponds to HUGO nomenclature gene names.
#' @note The data frame that is returned contains only "canonical" chromosomes
#' for each organism. It does not contain haplotypes or random locations and does
#' not contain chromosome M.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' hg19.genes <- get.ensembl.annotation("hg19","gene")
#' mm9.exons <- get.ensembl.annotation("mm9","exon")
#'}
get.ensembl.annotation <- function(org,type) {
    if (org=="tair10")
        dat <- "plants_mart"
    else if (org=="bmori2")
        dat <- "metazoa_mart"
    else
        dat <- "ENSEMBL_MART_ENSEMBL"
        
    mart <- tryCatch({
            useMart(biomart=dat,host=get.host(org),dataset=get.dataset(org))
        },
        error=function(e) {
            useMart(biomart=dat,host=get.alt.host(org),
            dataset=get.dataset(org))
        },
        finally={})
    chrs.exp <- paste(get.valid.chrs(org),collapse="|")
    if (type=="gene") {
        bm <- getBM(attributes=get.gene.attributes(org),mart=mart)
        ann <- data.frame(
            chromosome=paste("chr",bm$chromosome_name,sep=""),
            start=bm$start_position,
            end=bm$end_position,
            gene_id=bm$ensembl_gene_id,
            gc_content=if (org %in% c("hg18","mm9","tair10")) 
				bm$percentage_gc_content else bm$percentage_gene_gc_content,
            strand=ifelse(bm$strand==1,"+","-"),
            gene_name=if (org %in% c("hg18","mm9","tair10")) bm$external_gene_id 
                else bm$external_gene_name,
            biotype=bm$gene_biotype
        )
        rownames(ann) <- ann$gene_id
    }
    else if (type=="exon") {
        bm <- getBM(attributes=get.exon.attributes(org),mart=mart)
        if (org == "hg19") {
            disp("  Bypassing problem with hg19 Ensembl combined gene-exon ",
                "annotation... Will take slightly longer...")
            bmg <- getBM(attributes=get.gene.attributes(org),mart=mart)
            gene_name <- bmg$external_gene_name
            names(gene_name) <- bmg$ensembl_gene_id
            ann <- data.frame(
                chromosome=paste("chr",bm$chromosome_name,sep=""),
                start=bm$exon_chrom_start,
                end=bm$exon_chrom_end,
                exon_id=bm$ensembl_exon_id,
                gene_id=bm$ensembl_gene_id,
                strand=ifelse(bm$strand==1,"+","-"),
                gene_name=gene_name[bm$ensembl_gene_id],
                biotype=bm$gene_biotype
            )
            rownames(ann) <- ann$exon_id
        }
        else
            ann <- data.frame(
                chromosome=paste("chr",bm$chromosome_name,sep=""),
                start=bm$exon_chrom_start,
                end=bm$exon_chrom_end,
                exon_id=bm$ensembl_exon_id,
                gene_id=bm$ensembl_gene_id,
                strand=ifelse(bm$strand==1,"+","-"),
                gene_name=if (org %in% c("hg18","mm9","tair10")) 
                    bm$external_gene_id else bm$external_gene_name,
                biotype=bm$gene_biotype
            )
            rownames(ann) <- ann$exon_id
    }
    else if (type=="utr") {
        bm <- getBM(attributes=get.transcript.utr.attributes(org),mart=mart)
        ann <- data.frame(
            chromosome=paste("chr",bm$chromosome_name,sep=""),
            start=bm$`3_utr_start`,
            end=bm$`3_utr_end`,
            tstart=bm$transcript_start,
            tend=bm$transcript_end,
            transcript_id=bm$ensembl_transcript_id,
            gene_id=bm$ensembl_gene_id,
            strand=ifelse(bm$strand==1,"+","-"),
            gene_name=if (org %in% c("hg18","mm9","tair10")) 
                bm$external_gene_id else bm$external_gene_name,
            biotype=bm$gene_biotype
        )
        ann <- correct.transcripts(ann)
        ann <- ann[,c("chromosome","start","end","transcript_id","gene_id",
            "strand","gene_name","biotype")]
    }
    else if (type=="transcript") {
        bm <- getBM(attributes=get.transcript.attributes(org),mart=mart)
        ann <- data.frame(
            chromosome=paste("chr",bm$chromosome_name,sep=""),
            start=bm$transcript_start,
            end=bm$transcript_end,
            transcript_id=bm$ensembl_transcript_id,
            gene_id=bm$ensembl_gene_id,
            strand=ifelse(bm$strand==1,"+","-"),
            gene_name=if (org %in% c("hg18","mm9","tair10")) 
                bm$external_gene_id else bm$external_gene_name,
            biotype=bm$gene_biotype
        )
        rownames(ann) <- as.character(ann$transcript_id)
    }
    ann <- ann[order(ann$chromosome,ann$start),]
    ann <- ann[grep(chrs.exp,ann$chromosome),]
    ann$chromosome <- as.character(ann$chromosome)
    return(ann)
}

#' UCSC/RefSeq annotation downloader
#'
#' This function connects to the UCSC Genome Browser public database and downloads
#' annotation elements (gene co-ordinates, exon co-ordinates, gene identifications
#' etc.) for each of the supported organisms, but using UCSC instead of Ensembl.
#' See the help page of \code{\link{metaseqr}} for a list of supported organisms.
#' The function downloads annotation for an organism genes or exons.
#'
#' @param org the organism for which to download annotation.
#' @param type either \code{"gene"} or \code{"exon"}.
#' @param refdb either \code{"ucsc"} or \code{"refseq"}.
#' @param multic a logical value indicating the presence of multiple cores. Defaults
#' to \code{FALSE}. Do not change it if you are not sure whether package parallel
#' has been loaded or not. It is used in the case of \code{type="exon"} to process
#' the return value of the query to the UCSC Genome Browser database.
#' @return A data frame with the canonical (not isoforms!) genes or exons of the
#' requested organism. When \code{type="genes"}, the data frame has the following
#' columns: chromosome, start, end, gene_id, gc_content, strand, gene_name, biotype.
#' When \code{type="exon"} the data frame has the following columns: chromosome,
#' start, end, exon_id, gene_id, strand, gene_name, biotype. The gene_id and exon_id
#' correspond to UCSC or RefSeq gene and exon accessions respectively. The gene_name
#' corresponds to HUGO nomenclature gene names.
#' @note The data frame that is returned contains only "canonical" chromosomes
#' for each organism. It does not contain haplotypes or random locations and does
#' not contain chromosome M. Note also that as the UCSC databases do not contain
#' biotype classification like Ensembl, this will be returned as \code{NA} and
#' as a result, some quality control plots will not be available.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' hg19.genes <- get.ucsc.annotation("hg19","gene","ucsc")
#' mm9.exons <- get.ucsc.annotation("mm9","exon")
#'}
get.ucsc.annotation <- function(org,type,refdb="ucsc",multic=FALSE) {
    if (org=="bmori2") {
        warnwrap("Bombyx mori (silkworm) annotation is not supported by UCSC ",
            "or RefSeq! Will use Ensembl...")
        return(get.ensembl.annotation("bmori2",type))
    }
    if (!require(RMySQL)) {
        rmysql.present <- FALSE
        warnwrap("R package RMySQL is not present! Annotation will be ",
            "retrieved by downloading temporary files from UCSC and the usage
            of a temporary SQLite database...")
    }
    else
        rmysql.present <- TRUE
    if (!require(RSQLite))
        stopwrap("R package RSQLite is required to use annotation from UCSC!")

    if (org=="tair10") {
        warnwrap("Arabidopsis thaliana genome is not supported by UCSC Genome ",
            "Browser database! Switching to Ensembl...")
        return(get.ensembl.annotation("tair10",type))
    }
    if (org=="equcab2") {
        warnwrap("Equus cabalus genome is not supported by UCSC Genome ",
            "Browser database! Switching to Ensembl...")
        return(get.ensembl.annotation("equcab2",type))
    }

    valid.chrs <- get.valid.chrs(org)
    chrs.exp <- paste("^",paste(valid.chrs,collapse="$|^"),"$",sep="")

    db.org <- get.ucsc.organism(org)
    if (rmysql.present) {
        db.creds <- get.ucsc.credentials()
        drv <- dbDriver("MySQL")
        con <- dbConnect(drv,user=db.creds[2],password=NULL,dbname=db.org,
            host=db.creds[1])
        query <- get.ucsc.query(org,type,refdb)
        raw.ann <- dbGetQuery(con,query)
        dbDisconnect(con)
    }
    else {
        # This should return the same data frame as the db query
        tmp.sqlite <- get.ucsc.dbl(org,type,refdb)
        drv <- dbDriver("SQLite")
        con <- dbConnect(drv,dbname=tmp.sqlite)
        query <- get.ucsc.query(org,type,refdb)
        raw.ann <- dbGetQuery(con,query)
        dbDisconnect(con)
    }
    if (type=="gene") {
        ann <- raw.ann
        ann <- ann[grep(chrs.exp,ann$chromosome,perl=TRUE),]
        ann$chromosome <- as.character(ann$chromosome)
        rownames(ann) <- ann$gene_id
    }
    else if (type=="exon") {
        raw.ann <- raw.ann[grep(chrs.exp,raw.ann$chromosome,perl=TRUE),]
        ex.list <- wapply(multic,as.list(1:nrow(raw.ann)),function(x,d,s) {
            r <- d[x,]
            starts <- as.numeric(strsplit(r[,"start"],",")[[1]])
            ends <- as.numeric(strsplit(r[,"end"],",")[[1]])
            nexons <- length(starts)
            ret <- data.frame(
                rep(r[,"chromosome"],nexons),
                starts,ends,
                paste(r[,"exon_id"],"_e",1:nexons,sep=""),
                rep(r[,"strand"],nexons),
                rep(r[,"gene_id"],nexons),
                rep(r[,"gene_name"],nexons),
                rep(r[,"biotype"],nexons)
            )
            names(ret) <- names(r)
            rownames(ret) <- ret$exon_id
            ret <- makeGRangesFromDataFrame(
                df=ret,
                keep.extra.columns=TRUE,
                seqnames.field="chromosome",
                seqinfo=s
            )
            return(ret)
        },raw.ann,valid.chrs)
        tmp.ann <- do.call("c",ex.list)
        ann <- data.frame(
            chromosome=as.character(seqnames(tmp.ann)),
            start=start(tmp.ann),
            end=end(tmp.ann),
            exon_id=as.character(tmp.ann$exon_id),
            gene_id=as.character(tmp.ann$gene_id),
            strand=as.character(strand(tmp.ann)),
            gene_name=as.character(tmp.ann$gene_name),
            biotype=tmp.ann$biotype
        )
        rownames(ann) <- ann$exon_id
    }
    
    gc.content <- get.gc.content(ann,org)
    ann$gc_content <- gc.content
    ann <- ann[order(ann$chromosome,ann$start),]
    return(ann)
}

#' Return a named vector of GC-content for each genomic region
#'
#' Returns a named numeric vector (names are the genomic region names, e.g. genes)
#' given a data frame which can be converted to a GRanges object (e.g. it has at
#' least chromosome, start, end fields). This function works best when the input
#' annotation data frame has been retrieved using one of the SQL queries generated
#' from \code{\link{get.ucsc.query}}, used in \code{\link{get.ucsc.annotation}}.
#'
#' @param ann a data frame which can be converted to a GRanges object, that means
#' it has at least the chromosome, start, end fields. Preferably, the output of
#' \code{link{get.ucsc.annotation}}.
#' @param org one of metaseqR supported organisms.
#' @return A named numeric vector.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' ann <- get.ucsc.annotation("mm9","gene","ucsc")
#' gc <- get.gc.content(ann,"mm9")
#'}
get.gc.content <- function(ann,org) {
    if (missing(ann))
        stopwrap("A valid annotation data frame must be provided in order to ",
            "retrieve GC-content.")
    org <- tolower(org[1])
    check.text.args("org",org,c("hg18","hg19","hg38","mm9","mm10","rn5","dm3",
		"dm6","danrer7","pantro4","susscr3","tair10"),multiarg=FALSE)
    # Convert annotation to GRanges
    disp("Converting annotation to GenomicRanges object...")
    if (packageVersion("GenomicRanges")<1.14)
        ann.gr <- GRanges(
            seqnames=Rle(ann[,1]),
            ranges=IRanges(start=ann[,2],end=ann[,3]),
            strand=Rle(ann[,6]),
            name=as.character(ann[,4])
        )
    else
        ann.gr <- makeGRangesFromDataFrame(
            df=ann,
            keep.extra.columns=TRUE,
            seqnames.field="chromosome"
        )
    bsg <- load.bs.genome(org)
    disp("Getting DNA sequences...")
    seqs <- getSeq(bsg,names=ann.gr)
    disp("Getting GC content...")
    freq.matrix <- alphabetFrequency(seqs,as.prob=TRUE,baseOnly=TRUE)
    gc.content <- apply(freq.matrix,1,function(x) round(100*sum(x[2:3]),
        digits=2))
    names(gc.content) <- as.character(ann[,4])
    return(gc.content)
}

#' Return a proper formatted organism alias
#'
#' Returns the proper UCSC Genome Browser database organism alias based on what is
#' given to metaseqR. Internal use.
#'
#' @return A proper organism alias.
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' org <- get.ucsc.organism("danrer7")
#'}
get.ucsc.organism <- function(org) {
    switch(org,
        hg18 = { return("hg18") },
        hg19 = { return("hg19") },
        hg38 = { return("hg38") },
        mm9 = { return("mm9") },
        mm10 = { return("mm10") },
        rn5 = { return("rn5") },
        rn6 = { return("rn6") },
        dm3 = { return("dm3") },
        dm6 = { return("dm6") },
        danrer7 = { return("danRer7") },
        pantro4 = { return("panTro4") },
        susscr3 = { return("susScr3") },
        tair10 = { return("TAIR10") },
        equcab2 = { return("equCab2") }
    )
}

#' Return a proper formatted BSgenome organism name
#'
#' Returns a properly formatted BSgenome package name according to metaseqR's
#' supported organism. Internal use.
#'
#' @return A proper BSgenome package name.
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' bs.name <- get.bs.organism("hg18")
#'}
get.bs.organism <- function(org) {
    switch(org,
        hg18 = {
            return("BSgenome.Hsapiens.UCSC.hg18")
        },
        hg19 = {
            return("BSgenome.Hsapiens.UCSC.hg19")
        },
        hg38 = {
            return("BSgenome.Hsapiens.UCSC.hg38")
        },
        mm9 = {
            return("BSgenome.Mmusculus.UCSC.mm9")
        },
        mm10 = {
            return("BSgenome.Mmusculus.UCSC.mm10")
        },
        rn5 = {
            return("BSgenome.Rnorvegicus.UCSC.rn5")
        },
        rn6 = {
            return("BSgenome.Rnorvegicus.UCSC.rn6")
        },
        dm3 = {
            return("BSgenome.Dmelanogaster.UCSC.dm3")
        },
        dm6 = {
            return("BSgenome.Dmelanogaster.UCSC.dm6")
        },
        danrer7 = {
            return("BSgenome.Drerio.UCSC.danRer7")
        },
        pantro4 = {
            stopwrap("panTro4 is not yet supported by BSgenome! Please use ",
                "Ensembl as annoation source.")
        },
        susscr3 = {
            return("BSgenome.Sscrofa.UCSC.susScr3")
        },
        tair10 = {
            stopwrap("TAIR10 is not yet supported by BSgenome! Please use ",
                "Ensembl as annoation source.")
        }
    )
}

#' Loads (or downloads) the required BSGenome package
#'
#' Retrieves the required BSgenome package when the annotation source is \code{"ucsc"}
#' or \code{"refseq"}. These packages are required in order to estimate the
#' GC-content of the retrieved genes from UCSC or RefSeq.
#'
#' @param org one of \code{\link{metaseqr}} supported organisms.
#' @return The BSgenome object for the requested organism.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' bs.obj <- load.bs.genome("mm9")
#'}
load.bs.genome <- function(org) {
    if (!require(BiocManager))
        stopwrap("The Bioconductor package BiocManager is required to ",
            "proceed!")
    if (!require(BSgenome))
        stopwrap("The Bioconductor package BSgenome is required to ",
            "proceed!")
    bs.org <- get.bs.organism(org)
    if (bs.org %in% installed.genomes())
        bs.obj <- getBSgenome(get.ucsc.organism(org))
    else {
        BiocManager::install(bs.org)
        bs.obj <- getBSgenome(get.ucsc.organism(org))
    }
    return(bs.obj)
}

#' Biotype converter
#'
#' Returns biotypes as character vector. Internal use.
#'
#' @param a the annotation data frame (output of \code{\link{get.annotation}}).
#' @return A character vector of biotypes.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' hg18.genes <- get.annotation("hg18","gene")
#' hg18.bt <- get.biotypes(hg18.genes)
#'}
get.biotypes <- function(a) {
    return(as.character(unique(a$biotype)))
}

#' Annotation downloader helper
#'
#' Returns the appropriate Ensembl host address to get different versions of
#' annotation from. Internal use.
#'
#' @param org the organism for which to return the host address.
#' @return A string with the host address.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' mm9.host <- get.host("mm9")
#'}
get.host <- function(org) {
    switch(org,
        hg18 = { return("may2009.archive.ensembl.org") },
        hg19 = { return("grch37.ensembl.org") },
        hg38 = { return("www.ensembl.org") },
        mm9 = { return("may2012.archive.ensembl.org") },
        mm10 = { return("www.ensembl.org") },
        rn5 = { return("may2012.archive.ensembl.org") },
        rn6 = { return("www.ensembl.org") },
        dm3 = { return("grch37.ensembl.org") },
        dm6 = { return("www.ensembl.org") },
        danrer7 = { return("www.ensembl.org") },
        pantro4 = { return("www.ensembl.org") },
        susscr3 = { return("www.ensembl.org") },
        tair10 = { return("plants.ensembl.org") },
        equcab2 = { return("www.ensembl.org") },
        bmori2 = { return("metazoa.ensembl.org") }
    )
}

#' Annotation downloader helper
#'
#' Returns the appropriate Ensembl host address to get different versions of
#' annotation from (alternative hosts). Internal use.
#'
#' @param org the organism for which to return the host address.
#' @return A string with the host address.
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' mm9.host <- get.alt.host("mm9")
#'}
get.alt.host <- function(org) {
    switch(org,
        hg18 = { return("may2009.archive.ensembl.org") },
        hg19 = { return("grch37.ensembl.org") },
        hg38 = { return("uswest.ensembl.org") },
        mm9 = { return("may2012.archive.ensembl.org") },
        mm10 = { return("uswest.ensembl.org") },
        rn5 = { return("uswest.ensembl.org") },
        dm3 = { return("grch37.ensembl.org") },
        dm6 = { return("uswest.ensembl.org") },
        danrer7 = { return("uswest.ensembl.org") },
        pantro4 = { return("uswest.ensembl.org") },
        susscr3 = { return("uswest.ensembl.org") },
        tair10 = { return("plants.ensembl.org") },
        equcab2 = { return("uswest.ensembl.org") },
        bmori2 = { return("metazoa.ensembl.org") }
    )
}

#' Annotation downloader helper
#'
#' Returns a dataset (gene or exon) identifier for each organism recognized by
#' the Biomart service for Ensembl. Internal use.
#'
#' @param org the organism for which to return the identifier.
#' @return A string with the dataset identifier.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' dm6.id <- get.dataset("dm6")
#'}
get.dataset <- function(org) {
    switch(org,
        hg18 = { return("hsapiens_gene_ensembl") },
        hg19 = { return("hsapiens_gene_ensembl") },
        hg38 = { return("hsapiens_gene_ensembl") },
        mm9 = { return("mmusculus_gene_ensembl") },
        mm10 = { return("mmusculus_gene_ensembl") },
        rn5 = { return("rnorvegicus_gene_ensembl") },
        rn6 = { return("rnorvegicus_gene_ensembl") },
        dm3 = { return("dmelanogaster_gene_ensembl") },
        dm6 = { return("dmelanogaster_gene_ensembl") },
        danrer7 = { return("drerio_gene_ensembl") },
        pantro4 = { return("ptroglodytes_gene_ensembl") },
        susscr3 = { return("sscrofa_gene_ensembl") },
        tair10 = { return("athaliana_eg_gene") },
        equcab2 = { return("ecaballus_gene_ensembl") },
        bmori2 = { return("bmori_eg_gene") },
    )
}

#' Annotation downloader helper
#'
#' Returns a vector of chromosomes to maintain after annotation download. Internal
#' use.
#'
#' @param org the organism for which to return the chromosomes. 
#' @return A character vector of chromosomes.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' hg18.chr <- get.valid.chrs("hg18")
#'}
get.valid.chrs <- function(org)
{
    switch(org,
        hg18 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr20","chr21","chr22","chr3",
                "chr4","chr5","chr6","chr7","chr8","chr9","chrX","chrY"
            ))
        },
        hg19 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr20","chr21","chr22","chr3",
                "chr4","chr5","chr6","chr7","chr8","chr9","chrX","chrY"
            ))
        },
        hg38 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr20","chr21","chr22","chr3",
                "chr4","chr5","chr6","chr7","chr8","chr9","chrX","chrY"
            ))
        },
        mm9 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr3","chr4","chr5","chr6",
                "chr7","chr8","chr9","chrX","chrY"
            ))
        },
        mm10 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr3","chr4","chr5","chr6",
                "chr7","chr8","chr9","chrX","chrY"
            ))
        },
        rn5 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr3","chr4","chr5","chr6",
                "chr7","chr8","chr9","chrX"
            ))
        },
        rn6 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr3","chr4","chr5","chr6",
                "chr7","chr8","chr9","chrX"
            ))
        },
        dm3 = {
            return(c(
                "chr2L","chr2LHet","chr2R","chr2RHet","chr3L","chr3LHet",
                "chr3R","chr3RHet","chr4","chrU","chrUextra","chrX","chrXHet",
                "chrYHet"
            ))
        },
        dm6 = {
            return(c(
                "chr2L","chr2LHet","chr2R","chr2RHet","chr3L","chr3LHet",
                "chr3R","chr3RHet","chr4","chrU","chrUextra","chrX","chrXHet",
                "chrYHet"
            ))
        },
        danrer7 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr20","chr21","chr22","chr23",
                "chr24","chr25","chr3","chr4","chr5","chr6","chr7","chr8","chr9"
            ))
        },
        pantro4 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr20","chr21","chr22","chr2A","chr2B",
                "chr3","chr4","chr5","chr6","chr7","chr8","chr9","chrX","chrY"
            ))
        },
        susscr3 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr2","chr3","chr4","chr5","chr6","chr7",
                "chr8","chr9","chrX","chrY"
            ))
        },
        tair10 = {
            return(c(
                "chr1","chr2","chr3","chr4","chr5"
            ))
        },
        equcab2 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr20","chr21","chr22","chr23",
                "chr24","chr25","chr26","chr27","chr28","chr29","chr3","chr30",
                "chr31","chr4","chr5","chr6","chr7","chr8","chr9","chrX","chrY"
            ))
        },
        bmori2 = {
            return(c(
                "chr1","chr10","chr11","chr12","chr13","chr14","chr15","chr16",
                "chr17","chr18","chr19","chr2","chr20","chr21","chr22","chr23",
                "chr24","chr25","chr26","chr27","chr28","chr3","chr4","chr5",
                "chr6","chr7","chr8","chr9"
            ))
        }
    )
}

#' Annotation downloader helper
#'
#' Returns a vector of genomic annotation attributes which are used by the biomaRt
#' package in order to fetch the gene annotation for each organism. It has no
#' parameters. Internal use.
#'
#' @param org one of the supported organisms.
#' @return A character vector of Ensembl gene attributes.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' gene.attr <- get.gene.attributes()
#'}
get.gene.attributes <- function(org) {
    if (org %in% c("hg18","mm9","tair10","bmori2"))
        return(c(
            "chromosome_name",
            "start_position",
            "end_position",
            "ensembl_gene_id",
            "percentage_gc_content",
            "strand",
            "external_gene_id",
            "gene_biotype"
        ))
    else
        return(c(
            "chromosome_name",
            "start_position",
            "end_position",
            "ensembl_gene_id",
            "percentage_gene_gc_content",
            "strand",
            "external_gene_name",
            "gene_biotype"
        ))
}

#' Annotation downloader helper
#'
#' Returns a vector of genomic annotation attributes which are used by the biomaRt
#' package in order to fetch the exon annotation for each organism. It has no
#' parameters. Internal use.
#'
#' @param org one of the supported organisms.
#' @return A character vector of Ensembl exon attributes.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' exon.attr <- get.exon.attributes()
#'}
get.exon.attributes <- function(org) {
    if (org %in% c("hg18","mm9","tair10","bmori2"))
        return(c(
            "chromosome_name",
            "exon_chrom_start",
            "exon_chrom_end",
            "ensembl_exon_id",
            "strand",
            "ensembl_gene_id",
            "external_gene_id",
            "gene_biotype"
        ))
    else if (org == "hg19")
        return(c(
            "chromosome_name",
            "exon_chrom_start",
            "exon_chrom_end",
            "ensembl_exon_id",
            "strand",
            "ensembl_gene_id",
            "gene_biotype"
        ))
    else
        return(c(
            "chromosome_name",
            "exon_chrom_start",
            "exon_chrom_end",
            "ensembl_exon_id",
            "strand",
            "ensembl_gene_id",
            "external_gene_name",
            "gene_biotype"
        ))
}

#' Annotation downloader helper
#'
#' Returns a vector of genomic annotation attributes which are used by the biomaRt
#' package in order to fetch the exon annotation for each organism. It has no
#' parameters. Internal use.
#'
#' @param org one of the supported organisms.
#' @return A character vector of Ensembl transcript attributes.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' trans.attr <- get.transcript.attributes()
#'}
get.transcript.utr.attributes <- function(org) {
    if (org %in% c("hg18","mm9","tair10","bmori2"))
        return(c(
            "chromosome_name",
            "transcript_start",
            "transcript_end",
            "3_utr_start",
            "3_utr_end",
            "ensembl_transcript_id",
            "strand",
            "ensembl_gene_id",
            "external_gene_id",
            "gene_biotype"
        ))
    else
        return(c(
            "chromosome_name",
            "transcript_start",
            "transcript_end",
            "3_utr_start",
            "3_utr_end",
            "ensembl_transcript_id",
            "strand",
            "ensembl_gene_id",
            "external_gene_name",
            "gene_biotype"
        ))
}

get.transcript.attributes <- function(org) {
    if (org %in% c("hg18","mm9","tair10","bmori2"))
        return(c(
            "chromosome_name",
            "transcript_start",
            "transcript_end",
            "ensembl_transcript_id",
            "strand",
            "ensembl_gene_id",
            "external_gene_id",
            "gene_biotype"
        ))
    else
        return(c(
            "chromosome_name",
            "transcript_start",
            "transcript_end",
            "ensembl_transcript_id",
            "strand",
            "ensembl_gene_id",
            "external_gene_name",
            "gene_biotype"
        ))
}

correct.transcripts <- function(ann) {
    rownames(ann) <- paste("T",1:nrow(ann),sep="_")
    len <- ann[,3] - ann[,2]
    len <- len[-which(is.na(len))]
    len[len==0] <- 1
    def.utr.len <- round(2^mean(log2(len)))
    nas <- which(is.na(ann$start))
    ann.na <- ann[nas,]
    ann.na$start <- ann.na$tstart
    ann.na$end <- ann.na$tend
    tmp <- makeGRangesFromDataFrame(df=ann.na)
    tmp <- flank(resize(tmp,width=1,fix="end"),start=FALSE,width=def.utr.len)
    ann[names(tmp),"start"] <- start(tmp)
    ann[names(tmp),"end"] <- end(tmp)
    return(ann)
}
pmoulos/metaseqr documentation built on Dec. 29, 2020, 5:56 a.m.