R/ExonSkipping.R
In GeneStructureTools: Tools for spliced gene structure manipulation and analysis

Documented in findExonContainingTranscripts reorderExonNumbers skipExonInTranscript

#' Given the location of a whole spliced in exon, find transcripts which can splice out this exon
#' @param input whippetDataSet generated from \code{readWhippetDataSet()} or a Granges of exon coordinates
#' @param exons GRanges object made from a GTF containing exon coordinates
#' @param variableWidth How many nts overhang is allowed for finding matching exons
#' (default = 0, i.e. complete match)
#' @param findIntrons Find transcripts where the event occurs within the intron?
#' @param transcripts GRanges object made from a GTF containing transcript coordinates
#' (only required if findIntrons=TRUE)
#' @return data.frame with all overlapping exons
#' @export
#' @import GenomicRanges
#' @importFrom rtracklayer import
#' @family whippet splicing isoform creation
#' @author Beth Signal
#' @examples
#' whippetFiles <- system.file("extdata","whippet/",
#' package = "GeneStructureTools")
#' wds <- readWhippetDataSet(whippetFiles)
#' wds <- filterWhippetEvents(wds)
#'
#' gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf",
#' package = "GeneStructureTools"))
#' exons <- gtf[gtf$type=="exon"]
#' transcripts <- gtf[gtf$type=="transcript"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' wds.exonSkip <- filterWhippetEvents(wds, eventTypes="CE",psiDelta = 0.2)
#' exons.exonSkip <- findExonContainingTranscripts(wds.exonSkip, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
#'
#' exonFromGRanges <- exons[exons$exon_id == "ENSMUSE00001271768.1"]
#' exons.exonSkip <- findExonContainingTranscripts(exonFromGRanges, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
findExonContainingTranscripts <- function(input,
                                          exons,
                                          variableWidth=0,
                                          findIntrons=FALSE,
                                          transcripts){

    if(class(input) == "whippetDataSet"){
        # check all are CE
        whippetDataSet <- filterWhippetEvents(input,
                                              probability = 0,
                                              psiDelta = 0,
                                              eventTypes="CE")

        eventCoords <- coordinates(whippetDataSet)
    }else if(class(input) == "GRanges"){
        eventCoords <- input
        if(!("id" %in% names(mcols(eventCoords))) &
           "exon_id" %in% names(mcols(eventCoords))){
            eventCoords$id <- eventCoords$exon_id
        }else{
            stop("please specify \"id\" or \"exon_id\" in the input")
        }
    }

    # remove any duplicates
    overlaps <- GenomicRanges::findOverlaps(eventCoords, type="equal")

    overlaps <- overlaps[which(overlaps@from != overlaps@to)]
    if(length(overlaps) > 0){
        overlaps <- overlaps[which(overlaps@from < overlaps@to)]
        if(length(overlaps) > 0){
            duplicates <- unique(overlaps@to)
            eventCoords <- eventCoords[-duplicates]
        }
    }

    # whole match
    if(variableWidth == 0){
        overlaps <- GenomicRanges::findOverlaps(eventCoords, exons,
                                                type="equal")
        gtf.equal <- exons[overlaps@to]
        gtf.equal$from <- overlaps@from
    }else{
        # find all overlaps
        overlaps <- GenomicRanges::findOverlaps(eventCoords, exons)
        gtf.equal <- exons[overlaps@to]
        gtf.equal$from <- overlaps@from
        startDiff <- abs(start(gtf.equal) - start(eventCoords[gtf.equal$from]))
        endDiff <- abs(end(gtf.equal) - end(eventCoords[gtf.equal$from]))
        totalDiff <- startDiff + endDiff
        # remove overlaps with change in start+end greater than specified
        keep <- which(totalDiff <= variableWidth)
        gtf.equal <- gtf.equal[keep]
    }

    gtf.equal$new_id <- paste(gtf.equal$transcript_id,gtf.equal$from, sep="_")
    #gtf.equal$new_id <- with(as.data.frame(GenomicRanges::mcols(gtf.equal)),
    # paste0(transcript_id, "_",from))
    gtf.equal$exon_number <- as.numeric(gtf.equal$exon_number)
    skippedExons <- as.data.frame(GenomicRanges::mcols(
        gtf.equal)[,c('gene_id',
                      'transcript_id',
                      'transcript_type',
                      'from',
                      'exon_number')])
    skippedExons$alt_id <- eventCoords$id[skippedExons$from]
    skippedExons$from <- NULL
    skippedExons$start <- start(gtf.equal)
    skippedExons$end <- end(gtf.equal)
    skippedExons$overlaps <- "exon"


    if(findIntrons == TRUE){
        # overlaps a transcript (i.e. can overlap an intron)
        overlaps <- GenomicRanges::findOverlaps(eventCoords, transcripts)
        overlapsDF <- as.data.frame(overlaps)
        overlapsDF$from <- eventCoords$id[overlapsDF$queryHits]
        overlapsDF$to <- transcripts$transcript_id[overlapsDF$subjectHits]

        # annotate first/last exons
        # (takes ~ 2.5 sec, please do before running this function multiple times)
        if(!("first_last" %in% colnames(mcols(exons)))){
            t <- as.data.frame(table(exons$transcript_id))
            exons$first_last <- NA
            exons$first_last[exons$exon_number == 1] <- "first"
            exons$first_last[exons$exon_number ==
                                 t$Freq[match(exons$transcript_id,
                                              t$Var1)]] <- "last"
        }

        # check that skipped exon doesn't overlap the first/last exon
        overlapsExons <- GenomicRanges::findOverlaps(
            eventCoords, exons[which(exons$first_last %in% c("first","last"))])
        removeTranscripts <- exons$transcript_id[
            which(exons$first_last %in% c("first","last"))][overlapsExons@to]
        overlapsDF <- overlapsDF[which(!(overlapsDF$to %in%
                                             removeTranscripts)),]

        # gtf with ALL exons where there is an intron overlap
        gtf.within <- exons[
            which(exons$transcript_id %in% overlapsDF$to)]
        gtf.within$from <- overlapsDF$from[
            match(gtf.within$transcript_id, overlapsDF$to)]
        gtf.within <- gtf.within[
            which(!(gtf.within$transcript_id %in% c(gtf.equal$transcript_id)))]

        if(length(gtf.within) > 0){

            # check for non-eact exon matches
            overlappingExons <- as.data.frame(findOverlaps(eventCoords,
                                                           gtf.within))
            overlappingExons$from_id <- eventCoords$id[
                overlappingExons$queryHits]
            overlappingExons <- cbind(overlappingExons,
                                      as.data.frame(gtf.within[
                                          overlappingExons$subjectHits]))
            overlappingExons$to_id <- gtf.within$transcript_id[
                overlappingExons$subjectHits]


            gtf.within$new_id <- paste0(gtf.within$transcript_id,
                                        "_",gtf.within$from)
            gtf.within$exon_number <- 0

            overlappingIntrons <- as.data.frame(GenomicRanges::mcols(
                gtf.within)[,c('gene_id',
                               'transcript_id',
                               'transcript_type',
                               'from',
                               'exon_number')])


            overlappingIntrons <- overlappingIntrons[
                which(!(duplicated(paste0(overlappingIntrons$transcript_id, "-",
                                          overlappingIntrons$from)))),]
            overlappingIntrons$alt_id <- overlappingIntrons$from
            overlappingIntrons$from <- NULL
            overlappingIntrons$start <- start(eventCoords[
                match(overlappingIntrons$alt_id,
                      eventCoords$id)])
            overlappingIntrons$end <- end(eventCoords[
                match(overlappingIntrons$alt_id,
                      eventCoords$id)])
            overlappingIntrons$overlaps <- "intron"



            # non exact matches (i.e. that have a partial intron overlap)
            nonexact <- which(overlappingIntrons$transcript_id %in%
                                  overlappingExons$to_id)
            if(length(nonexact) > 0){
                overlappingIntrons$overlaps[nonexact] <- "nonexact"
                m <- match(overlappingIntrons$transcript_id,
                           overlappingExons$transcript_id)
                # replace coordinates with known coordinates
                overlappingIntrons$start <- overlappingExons$start[m]
                overlappingIntrons$end <- overlappingExons$end[m]
                overlappingIntrons$exon_number <-
                    overlappingExons$exon_number[m]
            }

            skippedExons <- rbind(skippedExons, overlappingIntrons)

        }
    }


    return(skippedExons)
}

#' Remove and include a skipped exon from the transcripts it overlaps
#' @param skippedExons data.frame generataed by findExonContainingTranscripts()
#' @param exons GRanges object made from a GTF with ONLY exon annotations
#' (no gene, transcript, CDS etc.)
#' @param glueExons Join together exons that are not seperated by exons?
#' @param whippetDataSet whippetDataSet generated from \code{readWhippetDataSet()}
#' @param match what type of match replacement should be done?
#' exact: exact matches to the skipped event only, also removes any intron overlaps
#' skip: keep non-exact exon match coordinates in included sets, and skip them in skipped sets
#' replace: replace non-exact exon match coordinates with event coordinates in included sets,
#' and skip them in skipped sets
#' @return GRanges with transcripts skipping exons
#' @export
#' @import GenomicRanges
#' @importFrom plyr desc
#' @importFrom rtracklayer import
#' @family whippet splicing isoform creation
#' @author Beth Signal
#' @examples
#' whippetFiles <- system.file("extdata","whippet/",
#' package = "GeneStructureTools")
#' wds <- readWhippetDataSet(whippetFiles)
#' wds <- filterWhippetEvents(wds)
#'
#' gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf",
#' package = "GeneStructureTools"))
#' exons <- gtf[gtf$type=="exon"]
#' transcripts <- gtf[gtf$type=="transcript"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' wds.exonSkip <- filterWhippetEvents(wds, eventTypes="CE",psiDelta = 0.2)
#' exons.exonSkip <- findExonContainingTranscripts(wds.exonSkip, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
#' ExonSkippingTranscripts <- skipExonInTranscript(exons.exonSkip, exons, whippetDataSet=wds.exonSkip)
#'
#' exonFromGRanges <- exons[exons$exon_id == "ENSMUSE00001271768.1"]
#' exons.exonSkip <- findExonContainingTranscripts(exonFromGRanges, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
#' ExonSkippingTranscripts <- skipExonInTranscript(exons.exonSkip, exons, match="skip")
skipExonInTranscript <- function(skippedExons,
                                 exons,
                                 glueExons=TRUE,
                                 whippetDataSet=NULL,
                                 match="exact"){

    if(!(match %in% c("exact","skip","replace"))){
        message("match must be 'exact', 'skip', or 'replace'")

        if(!is.null(whippetDataSet)){
            message("using match = 'exact'")
            match <- "exact"
        }else{
            message("using match = 'skip'")
            match <- "skip"
        }
    }
    if(is.null(whippetDataSet) & match=="exact"){
        message("cannot use match = 'exact' without a whippetDataSet")
        message("using match = 'skip'")
        match <- "skip"
    }

    # if a whippet dataset is being used
    if(!is.null(whippetDataSet)){
        # check all are CE
        whippetDataSet <- filterWhippetEvents(whippetDataSet,
                                              probability = 0,
                                              psiDelta = 0,
                                              eventTypes="CE")

        eventCoords <- coordinates(whippetDataSet)

        # remove non-exact matches
        if(match == "exact"){
            m <- match(skippedExons$alt_id, eventCoords$id)
            keep <- which((skippedExons$start) == start(eventCoords)[m] &
                              (skippedExons$end == end(eventCoords)[m]) &
                              skippedExons$overlaps=="exon")
            skippedExons <- skippedExons[keep,]
        }

        eventCoords <- eventCoords[match(skippedExons$alt_id, eventCoords$id)]
    }else{
        m <- match(skippedExons$gene_id, exons$gene_id)
        eventCoords <- GRanges(seqnames=seqnames(exons[m]),
                               ranges = IRanges(start=skippedExons$start,
                                                end=skippedExons$end),
                               strand=strand(exons[m]),
                               id=skippedExons$alt_id)
    }
    eventCoords <- eventCoords[match(skippedExons$alt_id, eventCoords$id)]
    eventCoords$exon_number <- skippedExons$exon_number
    eventCoords$transcript_id <- skippedExons$transcript_id
    eventCoords$transcript_type <- skippedExons$transcript_type
    eventCoords$gene_id <- skippedExons$gene_id
    eventCoords$exon_id <- eventCoords$id
    eventCoords$overlaps <- skippedExons$overlaps

    # replace exon coordinates with the event coordinates
    if(match == "replace"){
        oldStarts <- start(eventCoords)
        oldEnds <- end(eventCoords)
    }
    start(eventCoords) <- skippedExons$start
    end(eventCoords) <- skippedExons$end


    # transcripts containing the exon
    transcripts <- as.data.frame(table(skippedExons$transcript_id))
    gtfTranscripts <- exons[exons$transcript_id %in% transcripts$Var1]
    m <- match(gtfTranscripts$transcript_id, eventCoords$transcript_id)
    mcols(gtfTranscripts) <- cbind(mcols(gtfTranscripts),
                                   DataFrame(new_transcript_id=paste0(
                                       gtfTranscripts$transcript_id,"+AS ",
                                       eventCoords$exon_id[m])))
    mcols(eventCoords) <- cbind(mcols(eventCoords),
                                DataFrame(new_transcript_id = paste0(
                                    eventCoords$transcript_id,"+AS ",
                                    eventCoords$exon_id)))

    mcols(gtfTranscripts) <- mcols(
        gtfTranscripts)[,c('gene_id','transcript_id',
                           'transcript_type','exon_id',
                           'exon_number', 'new_transcript_id')]
    mcols(eventCoords) <- mcols(
        eventCoords)[,c('gene_id','transcript_id',
                        'transcript_type','exon_id',
                        'exon_number','new_transcript_id',
                        'overlaps')]

    needsDuplicated <- which(!(eventCoords$new_transcript_id %in%
                                   gtfTranscripts$new_transcript_id))

    if(length(needsDuplicated) > 0){
        gtfTranscripts_add <- gtfTranscripts[
            gtfTranscripts$transcript_id %in%
                eventCoords$transcript_id[needsDuplicated]]
    }

    while(length(needsDuplicated) > 0){
        gtfTranscripts_add <- gtfTranscripts_add[
            gtfTranscripts_add$transcript_id %in%
                eventCoords$transcript_id[needsDuplicated]]
        m <- match(gtfTranscripts_add$transcript_id,
                   eventCoords$transcript_id[needsDuplicated])
        gtfTranscripts_add$new_transcript_id <- paste0(
            gtfTranscripts_add$transcript_id,"+AS ",
            eventCoords$exon_id[needsDuplicated][m])
        gtfTranscripts <- c(gtfTranscripts, gtfTranscripts_add)
        needsDuplicated <- which(!(eventCoords$new_transcript_id %in%
                                       gtfTranscripts$new_transcript_id))
    }

    exon_names <- with(as.data.frame(gtfTranscripts),
                       paste0(seqnames, ":", start,"-",end))

    ol <- findOverlaps(gtfTranscripts, eventCoords, type="equal")
    ol <- as.data.frame(ol)
    ol$gtf_trans_id <- gtfTranscripts$new_transcript_id[ol$queryHits]
    ol$eventCoords_id <- eventCoords$new_transcript_id[ol$subjectHits]
    ol <- ol[ol$gtf_trans_id == ol$eventCoords_id,]

    # add/remove exons from skipped isoforms if introns are used
    if(match!="exact" & any(eventCoords$overlaps!="exon")){
        ol.var <- findOverlaps(gtfTranscripts,
                               eventCoords[eventCoords$overlaps != "exon"])
        ol.var <- as.data.frame(ol.var)
        ol.var$gtf_trans_id <-
            gtfTranscripts$new_transcript_id[ol.var$queryHits]
        ol.var$eventCoords_id <- eventCoords$new_transcript_id[
            which(eventCoords$overlaps != "exon")][ol.var$subjectHits]
        ol.var <- ol.var[ol.var$gtf_trans_id == ol.var$eventCoords_id,]
        if(nrow(ol.var) > 0){
            ol <- rbind(ol, ol.var)
        }
    }

    # remove the skipped exon
    rm <- unique(ol$queryHits)
    gtfTranscripts.rm <- gtfTranscripts[-rm]

    mcols(gtfTranscripts.rm) <- mcols(
        gtfTranscripts.rm)[,c('gene_id','new_transcript_id',
                              'transcript_type','exon_id',
                              'exon_number')]
    colnames(mcols(gtfTranscripts.rm))[2] <- "transcript_id"

    mcols(eventCoords) <- mcols(
        eventCoords)[,c('gene_id','new_transcript_id',
                        'transcript_type','exon_id',
                        'exon_number','overlaps')]
    colnames(mcols(eventCoords))[2] <- "transcript_id"

    gtfTranscripts.rm$exon_number <- as.numeric(gtfTranscripts.rm$exon_number)
    order <- order(gtfTranscripts.rm$transcript_id,
                   gtfTranscripts.rm$exon_number)
    gtfTranscripts.rm <- gtfTranscripts.rm[order]
    gtfTranscripts.rm$overlaps <- eventCoords$overlaps[
        match(gtfTranscripts.rm$transcript_id, eventCoords$new_transcript_id)]

    # add skipped exon back in
    gtfTranscripts.withExon <- gtfTranscripts.rm
    # replace with event coordinates
    if(match == "replace"){
        start(eventCoords) <- oldStarts
        end(eventCoords) <- oldEnds
    }
    gtfTranscripts.withExon <- c(gtfTranscripts.withExon, eventCoords)
    gtfTranscripts.withExon <- reorderExonNumbers(gtfTranscripts.withExon)

    gtfTranscripts.rm$set <- "skipped_exon"
    gtfTranscripts.withExon$set <- "included_exon"

    gtfTranscripts.rm <- c(gtfTranscripts.rm, gtfTranscripts.withExon)

    # rename skipped/included isoforms
    gtfTranscripts.rm$transcript_id[which(
        gtfTranscripts.rm$set=="skipped_exon")] <-
        gsub("AS", "ASSE", gtfTranscripts.rm$transcript_id[
            which(gtfTranscripts.rm$set=="skipped_exon")])
    gtfTranscripts.rm$transcript_id[which(
        gtfTranscripts.rm$set=="included_exon")] <-
        gsub("AS", "ASIE", gtfTranscripts.rm$transcript_id[
            which(gtfTranscripts.rm$set=="included_exon")])

    #join together exons that are not seperated by an exon
    if(glueExons==TRUE){

        # split pos/neg ordering
        gtfTranscripts.rm.neg <-
            gtfTranscripts.rm[which(as.logical(strand(gtfTranscripts.rm) ==
                                                   "-"))]
        order <- order(gtfTranscripts.rm.neg$transcript_id,
                       plyr::desc(gtfTranscripts.rm.neg$exon_number))
        gtfTranscripts.rm.neg <- gtfTranscripts.rm.neg[order]

        gtfTranscripts.rm.pos <-
            gtfTranscripts.rm[which(as.logical(strand(gtfTranscripts.rm) ==
                                                   "+"))]
        gtfTranscripts.rm <- c(gtfTranscripts.rm.pos,gtfTranscripts.rm.neg)

        #extend starts <---<---<---
        w <- which(end(ranges(gtfTranscripts.rm))[-length(gtfTranscripts.rm)] ==
                       start(ranges(gtfTranscripts.rm[-1])))
        gtfTranscripts.rm <- gtfTranscripts.rm
        GenomicRanges::start(GenomicRanges::ranges(gtfTranscripts.rm))[w+1] <-
            GenomicRanges::start(GenomicRanges::ranges(gtfTranscripts.rm))[w]

        # remove exons that cover now redundant regions
        overlaps <- findOverlaps(gtfTranscripts.rm, type="within")
        overlaps <- overlaps[overlaps@from == overlaps@to - 1]
        rm <- unique(overlaps@from)
        if(length(rm) > 0){
            gtfTranscripts.rm <- gtfTranscripts.rm[-rm]
        }

        #extend ends --->--->--->
        overlaps <- findOverlaps(gtfTranscripts.rm)
        overlaps <- overlaps[overlaps@from==overlaps@to +1]

        GenomicRanges::end(GenomicRanges::ranges(
            gtfTranscripts.rm))[overlaps@to] <-
            GenomicRanges::end(GenomicRanges::ranges(
                gtfTranscripts.rm))[overlaps@from]

        # remove exons that cover now redundant regions
        overlaps <- findOverlaps(gtfTranscripts.rm, type="within")
        overlaps <- overlaps[overlaps@from == overlaps@to + 1]

        rm <- unique(overlaps@from)
        if(length(rm) >0){
            gtfTranscripts.rm <- gtfTranscripts.rm[-rm]
        }

    }
    gtfTranscripts.rm$whippet_id <- unlist(lapply(stringr::str_split(
        gtfTranscripts.rm$transcript_id, " "),"[[",2))
    gtfTranscripts.rm$overlaps <- NULL
    return(gtfTranscripts.rm)
}

#' Reorder the exon numbers in a gtf annotation
#' @param exons GRanges object made from a GTF with ONLY exon annotations
#' (no gene, transcript, CDS etc.)
#' @param by what column are the transcripts grouped by?
#' @return The same input GRanges, but with exon numbers reordered.
#' @export
#' @import GenomicRanges
#' @importFrom rtracklayer import
#' @family gtf manipulation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf",
#' package = "GeneStructureTools"))
#' exons <- gtf[gtf$type=="exon"]
#' exons <- reorderExonNumbers(exons)
reorderExonNumbers <- function(exons, by="transcript_id"){
    n <- which(colnames(mcols(exons)) == by)

    order <- order(mcols(exons)[,n], start(exons))

    exons <- exons[order]

    transcriptTable <- as.data.frame(table(mcols(exons)[,n]))
    transcriptTable$strand <- as.character(strand(exons[match(
        transcriptTable$Var1,
        mcols(exons)[,n])]))

    exons$exon_number <-
        as.numeric(unlist(apply(transcriptTable, 1,
                                function(x) if(x[3] == "+"){
                                    c(1:(x[2]))}else{c((x[2]:1))})))
    return(exons)
}
Any scripts or data that you put into this service are public.
GeneStructureTools documentation built on Nov. 8, 2020, 6:04 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
GeneStructureTools
Tools for spliced gene structure manipulation and analysis

R/ExonSkipping.R
In GeneStructureTools: Tools for spliced gene structure manipulation and analysis

Defines functions reorderExonNumbers skipExonInTranscript findExonContainingTranscripts

Documented in findExonContainingTranscripts reorderExonNumbers skipExonInTranscript

Try the GeneStructureTools package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GeneStructureTools Tools for spliced gene structure manipulation and analysis

R/ExonSkipping.R In GeneStructureTools: Tools for spliced gene structure manipulation and analysis

Defines functions reorderExonNumbers skipExonInTranscript findExonContainingTranscripts

Documented in findExonContainingTranscripts reorderExonNumbers skipExonInTranscript

Try the GeneStructureTools package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GeneStructureTools
Tools for spliced gene structure manipulation and analysis

R/ExonSkipping.R
In GeneStructureTools: Tools for spliced gene structure manipulation and analysis