#' Given the location of a whole spliced in exon, find transcripts which can splice out this exon
#' @param input whippetDataSet generated from \code{readWhippetDataSet()} or a Granges of exon coordinates
#' @param exons GRanges object made from a GTF containing exon coordinates
#' @param variableWidth How many nts overhang is allowed for finding matching exons
#' (default = 0, i.e. complete match)
#' @param findIntrons Find transcripts where the event occurs within the intron?
#' (only required if findIntrons=TRUE)
#' @return data.frame with all overlapping exons
#' @export
#' @import GenomicRanges
#' @importFrom rtracklayer import
#' @family whippet splicing isoform creation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata", "gencode.vM25.small.gtf", package = "GeneStructureTools"))
#' exons <- gtf[gtf$type == "exon"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' whippetFiles <- system.file("extdata", "whippet_small/",
#' package = "GeneStructureTools"
#' )
#' wds <- readWhippetDataSet(whippetFiles)
#'
#' wds.exonSkip <- filterWhippetEvents(wds, eventTypes = "CE", psiDelta = 0.2)
#' exons.exonSkip <- findExonContainingTranscripts(wds.exonSkip, exons,
#' variableWidth = 0, findIntrons = FALSE
#' )
#'
#' exonFromGRanges <- exons[exons$exon_id == "ENSMUSE00000414559.2"]
#' exons.exonSkip <- findExonContainingTranscripts(exonFromGRanges, exons,
#' variableWidth = 0, findIntrons = FALSE
#' )
findExonContainingTranscripts <- function(input,
exons,
variableWidth = 0,
findIntrons = FALSE) {
if (is(input, "whippetDataSet")) {
# check all are CE
whippetDataSet <- filterWhippetEvents(input,
probability = 0,
psiDelta = 0,
eventTypes = "CE"
)
eventCoords <- coordinates(whippetDataSet)
} else if (is(input, "GRanges")) {
eventCoords <- input
if (!("id" %in% names(mcols(eventCoords))) &
"exon_id" %in% names(mcols(eventCoords))) {
eventCoords$id <- eventCoords$exon_id
} else {
stop("please specify \"id\" or \"exon_id\" in the input")
}
}
# remove any duplicates
overlaps <- GenomicRanges::findOverlaps(eventCoords, type = "equal")
overlaps <- overlaps[which(overlaps@from != overlaps@to)]
if (length(overlaps) > 0) {
overlaps <- overlaps[which(overlaps@from < overlaps@to)]
if (length(overlaps) > 0) {
duplicates <- unique(overlaps@to)
eventCoords <- eventCoords[-duplicates]
}
}
# whole match
if (variableWidth == 0) {
overlaps <- GenomicRanges::findOverlaps(eventCoords, exons,
type = "equal"
)
gtf.equal <- exons[overlaps@to]
gtf.equal$from <- overlaps@from
} else {
# find all overlaps
overlaps <- GenomicRanges::findOverlaps(eventCoords, exons)
gtf.equal <- exons[overlaps@to]
gtf.equal$from <- overlaps@from
startDiff <- abs(start(gtf.equal) - start(eventCoords[gtf.equal$from]))
endDiff <- abs(end(gtf.equal) - end(eventCoords[gtf.equal$from]))
totalDiff <- startDiff + endDiff
# remove overlaps with change in start+end greater than specified
keep <- which(totalDiff <= variableWidth)
gtf.equal <- gtf.equal[keep]
}
gtf.equal$new_id <- paste(gtf.equal$transcript_id, gtf.equal$from, sep = "_")
# gtf.equal$new_id <- with(as.data.frame(GenomicRanges::mcols(gtf.equal)),
# paste0(transcript_id, "_",from))
gtf.equal$exon_number <- as.numeric(gtf.equal$exon_number)
skippedExons <- as.data.frame(GenomicRanges::mcols(
gtf.equal
)[, c(
"gene_id",
"transcript_id",
"transcript_type",
"from",
"exon_number"
)])
skippedExons$alt_id <- eventCoords$id[skippedExons$from]
skippedExons$from <- NULL
skippedExons$start <- start(gtf.equal)
skippedExons$end <- end(gtf.equal)
skippedExons$overlaps <- "exon"
if (findIntrons == TRUE) {
# make a transcripts GRanges
transcripts <- exonsToTranscripts(exons)
# overlaps a transcript (i.e. can overlap an intron)
overlaps <- GenomicRanges::findOverlaps(eventCoords, transcripts)
overlapsDF <- as.data.frame(overlaps)
overlapsDF$from <- eventCoords$id[overlapsDF$queryHits]
overlapsDF$to <- transcripts$transcript_id[overlapsDF$subjectHits]
# annotate first/last exons
# (takes ~ 2.5 sec, please do before running this function multiple times)
if (!("first_last" %in% colnames(mcols(exons)))) {
t <- as.data.frame(table(exons$transcript_id))
exons$first_last <- NA
exons$first_last[exons$exon_number == 1] <- "first"
exons$first_last[exons$exon_number ==
t$Freq[match(
exons$transcript_id,
t$Var1
)]] <- "last"
}
# check that skipped exon doesn't overlap the first/last exon
overlapsExons <- GenomicRanges::findOverlaps(
eventCoords, exons[which(exons$first_last %in% c("first", "last"))]
)
removeTranscripts <- exons$transcript_id[
which(exons$first_last %in% c("first", "last"))
][overlapsExons@to]
overlapsDF <- overlapsDF[which(!(overlapsDF$to %in%
removeTranscripts)), ]
# gtf with ALL exons where there is an intron overlap
gtf.within <- exons[
which(exons$transcript_id %in% overlapsDF$to)
]
gtf.within$from <- overlapsDF$from[
match(gtf.within$transcript_id, overlapsDF$to)
]
gtf.within <- gtf.within[
which(!(gtf.within$transcript_id %in% c(gtf.equal$transcript_id)))
]
if (length(gtf.within) > 0) {
# check for non-eact exon matches
overlappingExons <- as.data.frame(findOverlaps(
eventCoords,
gtf.within
))
overlappingExons$from_id <- eventCoords$id[
overlappingExons$queryHits
]
overlappingExons <- cbind(
overlappingExons,
as.data.frame(gtf.within[
overlappingExons$subjectHits
])
)
overlappingExons$to_id <- gtf.within$transcript_id[
overlappingExons$subjectHits
]
gtf.within$new_id <- paste0(
gtf.within$transcript_id,
"_", gtf.within$from
)
gtf.within$exon_number <- 0
overlappingIntrons <- as.data.frame(GenomicRanges::mcols(
gtf.within
)[, c(
"gene_id",
"transcript_id",
"transcript_type",
"from",
"exon_number"
)])
overlappingIntrons <- overlappingIntrons[
which(!(duplicated(paste0(
overlappingIntrons$transcript_id, "-",
overlappingIntrons$from
)))),
]
overlappingIntrons$alt_id <- overlappingIntrons$from
overlappingIntrons$from <- NULL
overlappingIntrons$start <- start(eventCoords[
match(
overlappingIntrons$alt_id,
eventCoords$id
)
])
overlappingIntrons$end <- end(eventCoords[
match(
overlappingIntrons$alt_id,
eventCoords$id
)
])
overlappingIntrons$overlaps <- "intron"
# non exact matches (i.e. that have a partial intron overlap)
nonexact <- which(overlappingIntrons$transcript_id %in%
overlappingExons$to_id)
if (length(nonexact) > 0) {
overlappingIntrons$overlaps[nonexact] <- "nonexact"
m <- match(
overlappingIntrons$transcript_id,
overlappingExons$transcript_id
)
# replace coordinates with known coordinates
overlappingIntrons$start[nonexact] <- overlappingExons$start[m][nonexact]
overlappingIntrons$end[nonexact] <- overlappingExons$end[m][nonexact]
overlappingIntrons$exon_number[nonexact] <-
overlappingExons$exon_number[m][nonexact]
}
skippedExons <- rbind(skippedExons, overlappingIntrons)
}
}
return(skippedExons)
}
#' Remove and include a skipped exon from the transcripts it overlaps
#' @param skippedExons data.frame generataed by findExonContainingTranscripts()
#' @param exons GRanges object made from a GTF with ONLY exon annotations
#' (no gene, transcript, CDS etc.)
#' @param glueExons Join together exons that are not seperated by exons?
#' @param whippetDataSet whippetDataSet generated from \code{readWhippetDataSet()}
#' @param match what type of match replacement should be done?
#' exact: exact matches to the skipped event only, also removes any intron overlaps
#' skip: keep non-exact exon match coordinates in included sets, and skip them in skipped sets
#' replace: replace non-exact exon match coordinates with event coordinates in included sets,
#' and skip them in skipped sets
#' @return GRanges with transcripts skipping exons
#' @export
#' @import GenomicRanges
#' @importFrom plyr desc
#' @importFrom rtracklayer import
#' @family whippet splicing isoform creation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata", "gencode.vM25.small.gtf", package = "GeneStructureTools"))
#' exons <- gtf[gtf$type == "exon"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' whippetFiles <- system.file("extdata", "whippet_small/",
#' package = "GeneStructureTools"
#' )
#' wds <- readWhippetDataSet(whippetFiles)
#'
#' wds.exonSkip <- filterWhippetEvents(wds, eventTypes = "CE", psiDelta = 0.2)
#' exons.exonSkip <- findExonContainingTranscripts(wds.exonSkip, exons,
#' variableWidth = 0, findIntrons = FALSE
#' )
#' ExonSkippingTranscripts <- skipExonInTranscript(exons.exonSkip, exons, whippetDataSet = wds.exonSkip)
#'
#' exonFromGRanges <- exons[exons$exon_id == "ENSMUSE00000414559.2"]
#' exons.exonSkip <- findExonContainingTranscripts(exonFromGRanges, exons,
#' variableWidth = 0, findIntrons = FALSE
#' )
#' ExonSkippingTranscripts <- skipExonInTranscript(exons.exonSkip, exons, match = "skip")
skipExonInTranscript <- function(skippedExons,
exons,
glueExons = TRUE,
whippetDataSet = NULL,
match = "exact") {
if (!(match %in% c("exact", "skip", "replace"))) {
message("match must be 'exact', 'skip', or 'replace'")
if (!is.null(whippetDataSet)) {
message("using match = 'exact'")
match <- "exact"
} else {
message("using match = 'skip'")
match <- "skip"
}
}
if (is.null(whippetDataSet) & match == "exact") {
message("cannot use match = 'exact' without a whippetDataSet")
message("using match = 'skip'")
match <- "skip"
}
# if a whippet dataset is being used
if (!is.null(whippetDataSet)) {
# check all are CE
whippetDataSet <- filterWhippetEvents(whippetDataSet,
probability = 0,
psiDelta = 0,
eventTypes = "CE"
)
eventCoords <- coordinates(whippetDataSet)
# remove non-exact matches
if (match == "exact") {
m <- match(skippedExons$alt_id, eventCoords$id)
keep <- which((skippedExons$start) == start(eventCoords)[m] &
(skippedExons$end == end(eventCoords)[m]) &
skippedExons$overlaps == "exon")
skippedExons <- skippedExons[keep, ]
}
eventCoords <- eventCoords[match(skippedExons$alt_id, eventCoords$id)]
} else {
m <- match(skippedExons$gene_id, exons$gene_id)
eventCoords <- GRanges(
seqnames = seqnames(exons[m]),
ranges = IRanges(
start = skippedExons$start,
end = skippedExons$end
),
strand = strand(exons[m]),
id = skippedExons$alt_id
)
}
eventCoords <- eventCoords[match(skippedExons$alt_id, eventCoords$id)]
eventCoords$exon_number <- skippedExons$exon_number
eventCoords$transcript_id <- skippedExons$transcript_id
eventCoords$transcript_type <- skippedExons$transcript_type
eventCoords$gene_id <- skippedExons$gene_id
eventCoords$exon_id <- eventCoords$id
eventCoords$overlaps <- skippedExons$overlaps
# replace exon coordinates with the event coordinates
if (match == "replace") {
oldStarts <- start(eventCoords)
oldEnds <- end(eventCoords)
}
ranges(eventCoords) <-
IRanges::IRanges(start = skippedExons$start, end = skippedExons$end)
# transcripts containing the exon
transcripts <- as.data.frame(table(skippedExons$transcript_id))
gtfTranscripts <- exons[exons$transcript_id %in% transcripts$Var1]
m <- match(gtfTranscripts$transcript_id, eventCoords$transcript_id)
mcols(gtfTranscripts) <- cbind(
mcols(gtfTranscripts),
DataFrame(new_transcript_id = paste0(
gtfTranscripts$transcript_id, "+AS ",
eventCoords$exon_id[m]
))
)
mcols(eventCoords) <- cbind(
mcols(eventCoords),
DataFrame(new_transcript_id = paste0(
eventCoords$transcript_id, "+AS ",
eventCoords$exon_id
))
)
mcols(gtfTranscripts) <- mcols(
gtfTranscripts
)[, c(
"gene_id", "transcript_id",
"transcript_type", "exon_id",
"exon_number", "new_transcript_id"
)]
mcols(eventCoords) <- mcols(
eventCoords
)[, c(
"gene_id", "transcript_id",
"transcript_type", "exon_id",
"exon_number", "new_transcript_id",
"overlaps"
)]
needsDuplicated <- which(!(eventCoords$new_transcript_id %in%
gtfTranscripts$new_transcript_id))
if (length(needsDuplicated) > 0) {
gtfTranscripts_add <- gtfTranscripts[
gtfTranscripts$transcript_id %in%
eventCoords$transcript_id[needsDuplicated]
]
}
while (length(needsDuplicated) > 0) {
gtfTranscripts_add <- gtfTranscripts_add[
gtfTranscripts_add$transcript_id %in%
eventCoords$transcript_id[needsDuplicated]
]
m <- match(
gtfTranscripts_add$transcript_id,
eventCoords$transcript_id[needsDuplicated]
)
gtfTranscripts_add$new_transcript_id <- paste0(
gtfTranscripts_add$transcript_id, "+AS ",
eventCoords$exon_id[needsDuplicated][m]
)
gtfTranscripts <- c(gtfTranscripts, gtfTranscripts_add)
needsDuplicated <- which(!(eventCoords$new_transcript_id %in%
gtfTranscripts$new_transcript_id))
}
exon_names <- with(
as.data.frame(gtfTranscripts),
paste0(seqnames, ":", start, "-", end)
)
ol <- findOverlaps(gtfTranscripts, eventCoords, type = "equal")
ol <- as.data.frame(ol)
ol$gtf_trans_id <- gtfTranscripts$new_transcript_id[ol$queryHits]
ol$eventCoords_id <- eventCoords$new_transcript_id[ol$subjectHits]
ol <- ol[ol$gtf_trans_id == ol$eventCoords_id, ]
# add/remove exons from skipped isoforms if introns are used
if (match != "exact" & any(eventCoords$overlaps != "exon")) {
ol.var <- findOverlaps(
gtfTranscripts,
eventCoords[eventCoords$overlaps != "exon"]
)
ol.var <- as.data.frame(ol.var)
ol.var$gtf_trans_id <-
gtfTranscripts$new_transcript_id[ol.var$queryHits]
ol.var$eventCoords_id <- eventCoords$new_transcript_id[
which(eventCoords$overlaps != "exon")
][ol.var$subjectHits]
ol.var <- ol.var[ol.var$gtf_trans_id == ol.var$eventCoords_id, ]
if (nrow(ol.var) > 0) {
ol <- rbind(ol, ol.var)
}
}
# remove the skipped exon
rm <- unique(ol$queryHits)
gtfTranscripts.rm <- gtfTranscripts[-rm]
mcols(gtfTranscripts.rm) <- mcols(
gtfTranscripts.rm
)[, c(
"gene_id", "new_transcript_id",
"transcript_type", "exon_id",
"exon_number"
)]
colnames(mcols(gtfTranscripts.rm))[2] <- "transcript_id"
mcols(eventCoords) <- mcols(
eventCoords
)[, c(
"gene_id", "new_transcript_id",
"transcript_type", "exon_id",
"exon_number", "overlaps"
)]
colnames(mcols(eventCoords))[2] <- "transcript_id"
gtfTranscripts.rm$exon_number <- as.numeric(gtfTranscripts.rm$exon_number)
order <- order(
gtfTranscripts.rm$transcript_id,
gtfTranscripts.rm$exon_number
)
gtfTranscripts.rm <- gtfTranscripts.rm[order]
gtfTranscripts.rm$overlaps <- eventCoords$overlaps[
match(gtfTranscripts.rm$transcript_id, eventCoords$new_transcript_id)
]
# add skipped exon back in
gtfTranscripts.withExon <- gtfTranscripts.rm
# replace with event coordinates
if (match == "replace") {
start(eventCoords) <- oldStarts
end(eventCoords) <- oldEnds
}
gtfTranscripts.withExon <- c(gtfTranscripts.withExon, eventCoords)
gtfTranscripts.withExon <- reorderExonNumbers(gtfTranscripts.withExon)
gtfTranscripts.rm$set <- "skipped_exon"
gtfTranscripts.withExon$set <- "included_exon"
gtfTranscripts.rm <- c(gtfTranscripts.rm, gtfTranscripts.withExon)
# rename skipped/included isoforms
gtfTranscripts.rm$transcript_id[which(
gtfTranscripts.rm$set == "skipped_exon"
)] <-
gsub("AS", "ASSE", gtfTranscripts.rm$transcript_id[
which(gtfTranscripts.rm$set == "skipped_exon")
])
gtfTranscripts.rm$transcript_id[which(
gtfTranscripts.rm$set == "included_exon"
)] <-
gsub("AS", "ASIE", gtfTranscripts.rm$transcript_id[
which(gtfTranscripts.rm$set == "included_exon")
])
# join together exons that are not seperated by an exon
if (glueExons == TRUE) {
# split pos/neg ordering
gtfTranscripts.rm.neg <-
gtfTranscripts.rm[which(as.logical(strand(gtfTranscripts.rm) ==
"-"))]
order <- order(
gtfTranscripts.rm.neg$transcript_id,
plyr::desc(gtfTranscripts.rm.neg$exon_number)
)
gtfTranscripts.rm.neg <- gtfTranscripts.rm.neg[order]
gtfTranscripts.rm.pos <-
gtfTranscripts.rm[which(as.logical(strand(gtfTranscripts.rm) ==
"+"))]
gtfTranscripts.rm <- c(gtfTranscripts.rm.pos, gtfTranscripts.rm.neg)
# extend starts <---<---<---
w <- which(end(ranges(gtfTranscripts.rm))[-length(gtfTranscripts.rm)] ==
start(ranges(gtfTranscripts.rm[-1])))
gtfTranscripts.rm <- gtfTranscripts.rm
GenomicRanges::start(GenomicRanges::ranges(gtfTranscripts.rm))[w + 1] <-
GenomicRanges::start(GenomicRanges::ranges(gtfTranscripts.rm))[w]
# remove exons that cover now redundant regions
overlaps <- findOverlaps(gtfTranscripts.rm, type = "within")
overlaps <- overlaps[overlaps@from == overlaps@to - 1]
rm <- unique(overlaps@from)
if (length(rm) > 0) {
gtfTranscripts.rm <- gtfTranscripts.rm[-rm]
}
# extend ends --->--->--->
overlaps <- findOverlaps(gtfTranscripts.rm)
overlaps <- overlaps[overlaps@from == overlaps@to + 1]
GenomicRanges::end(GenomicRanges::ranges(
gtfTranscripts.rm
))[overlaps@to] <-
GenomicRanges::end(GenomicRanges::ranges(
gtfTranscripts.rm
))[overlaps@from]
# remove exons that cover now redundant regions
overlaps <- findOverlaps(gtfTranscripts.rm, type = "within")
overlaps <- overlaps[overlaps@from == overlaps@to + 1]
rm <- unique(overlaps@from)
if (length(rm) > 0) {
gtfTranscripts.rm <- gtfTranscripts.rm[-rm]
}
}
gtfTranscripts.rm$event_id <- unlist(lapply(stringr::str_split(
gtfTranscripts.rm$transcript_id, " "
), "[[", 2))
gtfTranscripts.rm$overlaps <- NULL
return(gtfTranscripts.rm)
}
#' Reorder the exon numbers in a gtf annotation
#' @param exons GRanges object made from a GTF with ONLY exon annotations
#' (no gene, transcript, CDS etc.)
#' @param by what column are the transcripts grouped by?
#' @return The same input GRanges, but with exon numbers reordered.
#' @export
#' @import GenomicRanges
#' @importFrom rtracklayer import
#' @family gtf manipulation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata", "gencode.vM25.small.gtf", package = "GeneStructureTools"))
#' exons <- gtf[gtf$type == "exon"]
#' exons <- reorderExonNumbers(exons)
reorderExonNumbers <- function(exons, by = "transcript_id") {
n <- which(colnames(mcols(exons)) == by)
order <- order(mcols(exons)[, n], start(exons))
exons <- exons[order]
transcriptTable <- as.data.frame(table(mcols(exons)[, n]))
transcriptTable$strand <- as.character(strand(exons[match(
transcriptTable$Var1,
mcols(exons)[, n]
)]))
exons$exon_number <-
as.numeric(unlist(apply(
transcriptTable, 1,
function(x) {
if (x[3] == "+") {
c(1:(x[2]))
} else {
c((x[2]:1))
}
}
)))
return(exons)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.