Nothing
#' Given the location of a whole spliced in exon, find transcripts which can splice out this exon
#' @param input whippetDataSet generated from \code{readWhippetDataSet()} or a Granges of exon coordinates
#' @param exons GRanges object made from a GTF containing exon coordinates
#' @param variableWidth How many nts overhang is allowed for finding matching exons
#' (default = 0, i.e. complete match)
#' @param findIntrons Find transcripts where the event occurs within the intron?
#' @param transcripts GRanges object made from a GTF containing transcript coordinates
#' (only required if findIntrons=TRUE)
#' @return data.frame with all overlapping exons
#' @export
#' @import GenomicRanges
#' @importFrom rtracklayer import
#' @family whippet splicing isoform creation
#' @author Beth Signal
#' @examples
#' whippetFiles <- system.file("extdata","whippet/",
#' package = "GeneStructureTools")
#' wds <- readWhippetDataSet(whippetFiles)
#' wds <- filterWhippetEvents(wds)
#'
#' gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf",
#' package = "GeneStructureTools"))
#' exons <- gtf[gtf$type=="exon"]
#' transcripts <- gtf[gtf$type=="transcript"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' wds.exonSkip <- filterWhippetEvents(wds, eventTypes="CE",psiDelta = 0.2)
#' exons.exonSkip <- findExonContainingTranscripts(wds.exonSkip, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
#'
#' exonFromGRanges <- exons[exons$exon_id == "ENSMUSE00001271768.1"]
#' exons.exonSkip <- findExonContainingTranscripts(exonFromGRanges, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
findExonContainingTranscripts <- function(input,
exons,
variableWidth=0,
findIntrons=FALSE,
transcripts){
if(class(input) == "whippetDataSet"){
# check all are CE
whippetDataSet <- filterWhippetEvents(input,
probability = 0,
psiDelta = 0,
eventTypes="CE")
eventCoords <- coordinates(whippetDataSet)
}else if(class(input) == "GRanges"){
eventCoords <- input
if(!("id" %in% names(mcols(eventCoords))) &
"exon_id" %in% names(mcols(eventCoords))){
eventCoords$id <- eventCoords$exon_id
}else{
stop("please specify \"id\" or \"exon_id\" in the input")
}
}
# remove any duplicates
overlaps <- GenomicRanges::findOverlaps(eventCoords, type="equal")
overlaps <- overlaps[which(overlaps@from != overlaps@to)]
if(length(overlaps) > 0){
overlaps <- overlaps[which(overlaps@from < overlaps@to)]
if(length(overlaps) > 0){
duplicates <- unique(overlaps@to)
eventCoords <- eventCoords[-duplicates]
}
}
# whole match
if(variableWidth == 0){
overlaps <- GenomicRanges::findOverlaps(eventCoords, exons,
type="equal")
gtf.equal <- exons[overlaps@to]
gtf.equal$from <- overlaps@from
}else{
# find all overlaps
overlaps <- GenomicRanges::findOverlaps(eventCoords, exons)
gtf.equal <- exons[overlaps@to]
gtf.equal$from <- overlaps@from
startDiff <- abs(start(gtf.equal) - start(eventCoords[gtf.equal$from]))
endDiff <- abs(end(gtf.equal) - end(eventCoords[gtf.equal$from]))
totalDiff <- startDiff + endDiff
# remove overlaps with change in start+end greater than specified
keep <- which(totalDiff <= variableWidth)
gtf.equal <- gtf.equal[keep]
}
gtf.equal$new_id <- paste(gtf.equal$transcript_id,gtf.equal$from, sep="_")
#gtf.equal$new_id <- with(as.data.frame(GenomicRanges::mcols(gtf.equal)),
# paste0(transcript_id, "_",from))
gtf.equal$exon_number <- as.numeric(gtf.equal$exon_number)
skippedExons <- as.data.frame(GenomicRanges::mcols(
gtf.equal)[,c('gene_id',
'transcript_id',
'transcript_type',
'from',
'exon_number')])
skippedExons$alt_id <- eventCoords$id[skippedExons$from]
skippedExons$from <- NULL
skippedExons$start <- start(gtf.equal)
skippedExons$end <- end(gtf.equal)
skippedExons$overlaps <- "exon"
if(findIntrons == TRUE){
# overlaps a transcript (i.e. can overlap an intron)
overlaps <- GenomicRanges::findOverlaps(eventCoords, transcripts)
overlapsDF <- as.data.frame(overlaps)
overlapsDF$from <- eventCoords$id[overlapsDF$queryHits]
overlapsDF$to <- transcripts$transcript_id[overlapsDF$subjectHits]
# annotate first/last exons
# (takes ~ 2.5 sec, please do before running this function multiple times)
if(!("first_last" %in% colnames(mcols(exons)))){
t <- as.data.frame(table(exons$transcript_id))
exons$first_last <- NA
exons$first_last[exons$exon_number == 1] <- "first"
exons$first_last[exons$exon_number ==
t$Freq[match(exons$transcript_id,
t$Var1)]] <- "last"
}
# check that skipped exon doesn't overlap the first/last exon
overlapsExons <- GenomicRanges::findOverlaps(
eventCoords, exons[which(exons$first_last %in% c("first","last"))])
removeTranscripts <- exons$transcript_id[
which(exons$first_last %in% c("first","last"))][overlapsExons@to]
overlapsDF <- overlapsDF[which(!(overlapsDF$to %in%
removeTranscripts)),]
# gtf with ALL exons where there is an intron overlap
gtf.within <- exons[
which(exons$transcript_id %in% overlapsDF$to)]
gtf.within$from <- overlapsDF$from[
match(gtf.within$transcript_id, overlapsDF$to)]
gtf.within <- gtf.within[
which(!(gtf.within$transcript_id %in% c(gtf.equal$transcript_id)))]
if(length(gtf.within) > 0){
# check for non-eact exon matches
overlappingExons <- as.data.frame(findOverlaps(eventCoords,
gtf.within))
overlappingExons$from_id <- eventCoords$id[
overlappingExons$queryHits]
overlappingExons <- cbind(overlappingExons,
as.data.frame(gtf.within[
overlappingExons$subjectHits]))
overlappingExons$to_id <- gtf.within$transcript_id[
overlappingExons$subjectHits]
gtf.within$new_id <- paste0(gtf.within$transcript_id,
"_",gtf.within$from)
gtf.within$exon_number <- 0
overlappingIntrons <- as.data.frame(GenomicRanges::mcols(
gtf.within)[,c('gene_id',
'transcript_id',
'transcript_type',
'from',
'exon_number')])
overlappingIntrons <- overlappingIntrons[
which(!(duplicated(paste0(overlappingIntrons$transcript_id, "-",
overlappingIntrons$from)))),]
overlappingIntrons$alt_id <- overlappingIntrons$from
overlappingIntrons$from <- NULL
overlappingIntrons$start <- start(eventCoords[
match(overlappingIntrons$alt_id,
eventCoords$id)])
overlappingIntrons$end <- end(eventCoords[
match(overlappingIntrons$alt_id,
eventCoords$id)])
overlappingIntrons$overlaps <- "intron"
# non exact matches (i.e. that have a partial intron overlap)
nonexact <- which(overlappingIntrons$transcript_id %in%
overlappingExons$to_id)
if(length(nonexact) > 0){
overlappingIntrons$overlaps[nonexact] <- "nonexact"
m <- match(overlappingIntrons$transcript_id,
overlappingExons$transcript_id)
# replace coordinates with known coordinates
overlappingIntrons$start <- overlappingExons$start[m]
overlappingIntrons$end <- overlappingExons$end[m]
overlappingIntrons$exon_number <-
overlappingExons$exon_number[m]
}
skippedExons <- rbind(skippedExons, overlappingIntrons)
}
}
return(skippedExons)
}
#' Remove and include a skipped exon from the transcripts it overlaps
#' @param skippedExons data.frame generataed by findExonContainingTranscripts()
#' @param exons GRanges object made from a GTF with ONLY exon annotations
#' (no gene, transcript, CDS etc.)
#' @param glueExons Join together exons that are not seperated by exons?
#' @param whippetDataSet whippetDataSet generated from \code{readWhippetDataSet()}
#' @param match what type of match replacement should be done?
#' exact: exact matches to the skipped event only, also removes any intron overlaps
#' skip: keep non-exact exon match coordinates in included sets, and skip them in skipped sets
#' replace: replace non-exact exon match coordinates with event coordinates in included sets,
#' and skip them in skipped sets
#' @return GRanges with transcripts skipping exons
#' @export
#' @import GenomicRanges
#' @importFrom plyr desc
#' @importFrom rtracklayer import
#' @family whippet splicing isoform creation
#' @author Beth Signal
#' @examples
#' whippetFiles <- system.file("extdata","whippet/",
#' package = "GeneStructureTools")
#' wds <- readWhippetDataSet(whippetFiles)
#' wds <- filterWhippetEvents(wds)
#'
#' gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf",
#' package = "GeneStructureTools"))
#' exons <- gtf[gtf$type=="exon"]
#' transcripts <- gtf[gtf$type=="transcript"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' wds.exonSkip <- filterWhippetEvents(wds, eventTypes="CE",psiDelta = 0.2)
#' exons.exonSkip <- findExonContainingTranscripts(wds.exonSkip, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
#' ExonSkippingTranscripts <- skipExonInTranscript(exons.exonSkip, exons, whippetDataSet=wds.exonSkip)
#'
#' exonFromGRanges <- exons[exons$exon_id == "ENSMUSE00001271768.1"]
#' exons.exonSkip <- findExonContainingTranscripts(exonFromGRanges, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
#' ExonSkippingTranscripts <- skipExonInTranscript(exons.exonSkip, exons, match="skip")
skipExonInTranscript <- function(skippedExons,
exons,
glueExons=TRUE,
whippetDataSet=NULL,
match="exact"){
if(!(match %in% c("exact","skip","replace"))){
message("match must be 'exact', 'skip', or 'replace'")
if(!is.null(whippetDataSet)){
message("using match = 'exact'")
match <- "exact"
}else{
message("using match = 'skip'")
match <- "skip"
}
}
if(is.null(whippetDataSet) & match=="exact"){
message("cannot use match = 'exact' without a whippetDataSet")
message("using match = 'skip'")
match <- "skip"
}
# if a whippet dataset is being used
if(!is.null(whippetDataSet)){
# check all are CE
whippetDataSet <- filterWhippetEvents(whippetDataSet,
probability = 0,
psiDelta = 0,
eventTypes="CE")
eventCoords <- coordinates(whippetDataSet)
# remove non-exact matches
if(match == "exact"){
m <- match(skippedExons$alt_id, eventCoords$id)
keep <- which((skippedExons$start) == start(eventCoords)[m] &
(skippedExons$end == end(eventCoords)[m]) &
skippedExons$overlaps=="exon")
skippedExons <- skippedExons[keep,]
}
eventCoords <- eventCoords[match(skippedExons$alt_id, eventCoords$id)]
}else{
m <- match(skippedExons$gene_id, exons$gene_id)
eventCoords <- GRanges(seqnames=seqnames(exons[m]),
ranges = IRanges(start=skippedExons$start,
end=skippedExons$end),
strand=strand(exons[m]),
id=skippedExons$alt_id)
}
eventCoords <- eventCoords[match(skippedExons$alt_id, eventCoords$id)]
eventCoords$exon_number <- skippedExons$exon_number
eventCoords$transcript_id <- skippedExons$transcript_id
eventCoords$transcript_type <- skippedExons$transcript_type
eventCoords$gene_id <- skippedExons$gene_id
eventCoords$exon_id <- eventCoords$id
eventCoords$overlaps <- skippedExons$overlaps
# replace exon coordinates with the event coordinates
if(match == "replace"){
oldStarts <- start(eventCoords)
oldEnds <- end(eventCoords)
}
start(eventCoords) <- skippedExons$start
end(eventCoords) <- skippedExons$end
# transcripts containing the exon
transcripts <- as.data.frame(table(skippedExons$transcript_id))
gtfTranscripts <- exons[exons$transcript_id %in% transcripts$Var1]
m <- match(gtfTranscripts$transcript_id, eventCoords$transcript_id)
mcols(gtfTranscripts) <- cbind(mcols(gtfTranscripts),
DataFrame(new_transcript_id=paste0(
gtfTranscripts$transcript_id,"+AS ",
eventCoords$exon_id[m])))
mcols(eventCoords) <- cbind(mcols(eventCoords),
DataFrame(new_transcript_id = paste0(
eventCoords$transcript_id,"+AS ",
eventCoords$exon_id)))
mcols(gtfTranscripts) <- mcols(
gtfTranscripts)[,c('gene_id','transcript_id',
'transcript_type','exon_id',
'exon_number', 'new_transcript_id')]
mcols(eventCoords) <- mcols(
eventCoords)[,c('gene_id','transcript_id',
'transcript_type','exon_id',
'exon_number','new_transcript_id',
'overlaps')]
needsDuplicated <- which(!(eventCoords$new_transcript_id %in%
gtfTranscripts$new_transcript_id))
if(length(needsDuplicated) > 0){
gtfTranscripts_add <- gtfTranscripts[
gtfTranscripts$transcript_id %in%
eventCoords$transcript_id[needsDuplicated]]
}
while(length(needsDuplicated) > 0){
gtfTranscripts_add <- gtfTranscripts_add[
gtfTranscripts_add$transcript_id %in%
eventCoords$transcript_id[needsDuplicated]]
m <- match(gtfTranscripts_add$transcript_id,
eventCoords$transcript_id[needsDuplicated])
gtfTranscripts_add$new_transcript_id <- paste0(
gtfTranscripts_add$transcript_id,"+AS ",
eventCoords$exon_id[needsDuplicated][m])
gtfTranscripts <- c(gtfTranscripts, gtfTranscripts_add)
needsDuplicated <- which(!(eventCoords$new_transcript_id %in%
gtfTranscripts$new_transcript_id))
}
exon_names <- with(as.data.frame(gtfTranscripts),
paste0(seqnames, ":", start,"-",end))
ol <- findOverlaps(gtfTranscripts, eventCoords, type="equal")
ol <- as.data.frame(ol)
ol$gtf_trans_id <- gtfTranscripts$new_transcript_id[ol$queryHits]
ol$eventCoords_id <- eventCoords$new_transcript_id[ol$subjectHits]
ol <- ol[ol$gtf_trans_id == ol$eventCoords_id,]
# add/remove exons from skipped isoforms if introns are used
if(match!="exact" & any(eventCoords$overlaps!="exon")){
ol.var <- findOverlaps(gtfTranscripts,
eventCoords[eventCoords$overlaps != "exon"])
ol.var <- as.data.frame(ol.var)
ol.var$gtf_trans_id <-
gtfTranscripts$new_transcript_id[ol.var$queryHits]
ol.var$eventCoords_id <- eventCoords$new_transcript_id[
which(eventCoords$overlaps != "exon")][ol.var$subjectHits]
ol.var <- ol.var[ol.var$gtf_trans_id == ol.var$eventCoords_id,]
if(nrow(ol.var) > 0){
ol <- rbind(ol, ol.var)
}
}
# remove the skipped exon
rm <- unique(ol$queryHits)
gtfTranscripts.rm <- gtfTranscripts[-rm]
mcols(gtfTranscripts.rm) <- mcols(
gtfTranscripts.rm)[,c('gene_id','new_transcript_id',
'transcript_type','exon_id',
'exon_number')]
colnames(mcols(gtfTranscripts.rm))[2] <- "transcript_id"
mcols(eventCoords) <- mcols(
eventCoords)[,c('gene_id','new_transcript_id',
'transcript_type','exon_id',
'exon_number','overlaps')]
colnames(mcols(eventCoords))[2] <- "transcript_id"
gtfTranscripts.rm$exon_number <- as.numeric(gtfTranscripts.rm$exon_number)
order <- order(gtfTranscripts.rm$transcript_id,
gtfTranscripts.rm$exon_number)
gtfTranscripts.rm <- gtfTranscripts.rm[order]
gtfTranscripts.rm$overlaps <- eventCoords$overlaps[
match(gtfTranscripts.rm$transcript_id, eventCoords$new_transcript_id)]
# add skipped exon back in
gtfTranscripts.withExon <- gtfTranscripts.rm
# replace with event coordinates
if(match == "replace"){
start(eventCoords) <- oldStarts
end(eventCoords) <- oldEnds
}
gtfTranscripts.withExon <- c(gtfTranscripts.withExon, eventCoords)
gtfTranscripts.withExon <- reorderExonNumbers(gtfTranscripts.withExon)
gtfTranscripts.rm$set <- "skipped_exon"
gtfTranscripts.withExon$set <- "included_exon"
gtfTranscripts.rm <- c(gtfTranscripts.rm, gtfTranscripts.withExon)
# rename skipped/included isoforms
gtfTranscripts.rm$transcript_id[which(
gtfTranscripts.rm$set=="skipped_exon")] <-
gsub("AS", "ASSE", gtfTranscripts.rm$transcript_id[
which(gtfTranscripts.rm$set=="skipped_exon")])
gtfTranscripts.rm$transcript_id[which(
gtfTranscripts.rm$set=="included_exon")] <-
gsub("AS", "ASIE", gtfTranscripts.rm$transcript_id[
which(gtfTranscripts.rm$set=="included_exon")])
#join together exons that are not seperated by an exon
if(glueExons==TRUE){
# split pos/neg ordering
gtfTranscripts.rm.neg <-
gtfTranscripts.rm[which(as.logical(strand(gtfTranscripts.rm) ==
"-"))]
order <- order(gtfTranscripts.rm.neg$transcript_id,
plyr::desc(gtfTranscripts.rm.neg$exon_number))
gtfTranscripts.rm.neg <- gtfTranscripts.rm.neg[order]
gtfTranscripts.rm.pos <-
gtfTranscripts.rm[which(as.logical(strand(gtfTranscripts.rm) ==
"+"))]
gtfTranscripts.rm <- c(gtfTranscripts.rm.pos,gtfTranscripts.rm.neg)
#extend starts <---<---<---
w <- which(end(ranges(gtfTranscripts.rm))[-length(gtfTranscripts.rm)] ==
start(ranges(gtfTranscripts.rm[-1])))
gtfTranscripts.rm <- gtfTranscripts.rm
GenomicRanges::start(GenomicRanges::ranges(gtfTranscripts.rm))[w+1] <-
GenomicRanges::start(GenomicRanges::ranges(gtfTranscripts.rm))[w]
# remove exons that cover now redundant regions
overlaps <- findOverlaps(gtfTranscripts.rm, type="within")
overlaps <- overlaps[overlaps@from == overlaps@to - 1]
rm <- unique(overlaps@from)
if(length(rm) > 0){
gtfTranscripts.rm <- gtfTranscripts.rm[-rm]
}
#extend ends --->--->--->
overlaps <- findOverlaps(gtfTranscripts.rm)
overlaps <- overlaps[overlaps@from==overlaps@to +1]
GenomicRanges::end(GenomicRanges::ranges(
gtfTranscripts.rm))[overlaps@to] <-
GenomicRanges::end(GenomicRanges::ranges(
gtfTranscripts.rm))[overlaps@from]
# remove exons that cover now redundant regions
overlaps <- findOverlaps(gtfTranscripts.rm, type="within")
overlaps <- overlaps[overlaps@from == overlaps@to + 1]
rm <- unique(overlaps@from)
if(length(rm) >0){
gtfTranscripts.rm <- gtfTranscripts.rm[-rm]
}
}
gtfTranscripts.rm$whippet_id <- unlist(lapply(stringr::str_split(
gtfTranscripts.rm$transcript_id, " "),"[[",2))
gtfTranscripts.rm$overlaps <- NULL
return(gtfTranscripts.rm)
}
#' Reorder the exon numbers in a gtf annotation
#' @param exons GRanges object made from a GTF with ONLY exon annotations
#' (no gene, transcript, CDS etc.)
#' @param by what column are the transcripts grouped by?
#' @return The same input GRanges, but with exon numbers reordered.
#' @export
#' @import GenomicRanges
#' @importFrom rtracklayer import
#' @family gtf manipulation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf",
#' package = "GeneStructureTools"))
#' exons <- gtf[gtf$type=="exon"]
#' exons <- reorderExonNumbers(exons)
reorderExonNumbers <- function(exons, by="transcript_id"){
n <- which(colnames(mcols(exons)) == by)
order <- order(mcols(exons)[,n], start(exons))
exons <- exons[order]
transcriptTable <- as.data.frame(table(mcols(exons)[,n]))
transcriptTable$strand <- as.character(strand(exons[match(
transcriptTable$Var1,
mcols(exons)[,n])]))
exons$exon_number <-
as.numeric(unlist(apply(transcriptTable, 1,
function(x) if(x[3] == "+"){
c(1:(x[2]))}else{c((x[2]:1))})))
return(exons)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.