#' Container to hold raw data for SpliceWiz coverage plots
#'
#' This object is generated using getCoverageData or getGenomeData methods, and
#' is used as input for generating coverage plots.
#' @param se A \linkS4class{NxtSE} object, created by [makeSE].
#' COV files must be linked to the NxtSE object. To do this, see the example
#' in [makeSE]. Required by `plotCoverage`. Not required by `plotGenome` if
#' `reference_path` is supplied.
#' @param reference_path The path of the reference generated by
#' [Build-Reference-methods]. Required by `plotGenome` if a
#' \linkS4class{NxtSE} object is not specified.
#' @param Event The `EventName` of the IR / alternative splicing event to be
#' displayed. Use `rownames(se)` to display a list of valid events.
#' @param Gene Whether to use the range for the given `Gene`. If given,
#' overrides `Event` (but `Event` or `norm_event` will be used to normalise by
#' condition). Valid `Gene` entries include gene_id (Ensembl ID) or gene_name
#' (Gene Symbol).
#' @param seqname,start,end The chromosome (string) and genomic `start/end`
#' coordinates (numeric) of the region to display. If present, overrides both
#' `Event` and `Gene`. E.g. for a given region of chr1:10000-11000,
#' use the parameters: `seqname = "chr1", start = 10000, end = 11000`
#' @param coordinates A string specifying genomic coordinates can be given
#' instead of `seqname,start,end`. Must be of the format "chr:start-end", e.g.
#' "chr1:10000-11000"
#' @param strand Whether to show coverage of both strands "*" (default), or
#' from the "+" or "-" strand only.
#' @param zoom_factor Zoom out from event. Each level of zoom zooms out by a
#' factor of 3. E.g. for a query region of chr1:10000-11000, if a
#' `zoom_factor` of 1.0 is given, chr1:99000-12000 will be displayed.
#' @param tracks The names of individual samples,
#' or the names of the different conditions to be plotted. For the latter, set
#' `condition` to the specified condition category.
#' @param condition To display normalised coverage per condition, set this to
#' the condition category. If omitted, `tracks` are assumed to refer to the
#' names of individual samples.
#' @param bases_flanking (Default = `100`) How many bases flanking the zoomed
#' window. Useful when
#' used in conjunction with zoom_factor == 0. E.g. for a given region of
#' chr1:10000-11000, if `zoom_factor = 0` and `bases_flanking = 100`, the
#' region chr1:9900-11100 will be displayed.
#' @param object For `plotAnnoTrack()`, the `covDataObject` created by
#' `getCoverageData()` or `getGenomeData()`
#' @param view_start,view_end Start and end coordinates of plotting function.
#' Note that plot coordinates may be different from retrieval coordinates
#' and is useful for zooming in.
#' @param reverseGenomeCoords Whether the genomic axis should be reversed
#' to make it more convenient to plot reverse-stranded genes
#' @param condensed (default `FALSE)
#' Whether the genomic track should be condensed to plot whole
#' genes, rather than transcripts. Preferred if multiple genes are plotted
#' on a zoomed-out plot
#' @param selected_transcripts (default `""`) One or more transcript names or
#' ID's to be displayed on the annotation track.
#' @param plot_key_isoforms (default `FALSE`) If `TRUE`, plots only transcripts
#' involved in the given splicing `Event`.
#' @param usePlotly (default `FALSE`)
#' Whether to return a plotly or ggplot object.
#' @param ... Ignored / not used
#' @return
#' For getCoverageData(): A covDataObject containing required data used to
#' generate downstream
#' For plotAnnoTrack(): A ggplot or plotly object
#' @examples
#' se <- SpliceWiz_example_NxtSE(novelSplicing = TRUE)
#'
#' # Assign annotation of the experimental conditions
#' colData(se)$treatment <- rep(c("A", "B"), each = 3)
#'
#' dataObj <- getCoverageData(
#' se,
#' Event = "SE:SRSF3-203-exon4;SRSF3-202-int3",
#' tracks = colnames(se)
#' )
#'
#' # Show `EventName`s of supported splicing events
#' # contained within covDataObject
#'
#' showEvents(dataObj)
#'
#' # A limited covDataObject containing only the reference can be generated
#' # from the SpliceWiz reference
#'
#' buildRef(
#' reference_path = file.path(tempdir(), "Reference"),
#' fasta = chrZ_genome(),
#' gtf = chrZ_gtf()
#' )
#'
#' genomeObj <- getGenomeData(
#' reference_path = file.path(tempdir(), "Reference"),
#' Gene = "SRSF3"
#' )
#'
#' # Plot reference track directly from the covDataObject
#'
#' # NB: Event plotting is not supported for reference-derived `covDataObject`s
#' plotAnnoTrack(genomeObj)
#'
#' plotAnnoTrack(dataObj, Event = "SE:SRSF3-203-exon4;SRSF3-202-int3")
#'
#' @name covDataObject-class
#' @aliases
#' showEvents showEvents,covDataObject-method
#' @seealso [covPlotObject-class]
#' @md
NULL
covDataObject <- function(
args = list(),
annotation = list(),
colData = data.frame(),
covData = list(),
juncData = list(),
normData = list()
) {
newCovObj <- new("covDataObject",
args = args,
annotation = annotation,
colData = colData,
covData = covData, juncData = juncData,
normData = normData
)
newCovObj
}
#' @describeIn covDataObject-class Returns the EventNames for which events can
#' be normalized using the given covDataObject
#' @export
setMethod("showEvents", c(object = "covDataObject"), function(
object
) {
return(object@normData$rowData$EventName)
})
#' @describeIn covDataObject-class Get coverage / genome data for plotting
#' coverage plots
#' @export
getCoverageData <- function(
se,
# Info for determining genomic locus
Event, Gene,
seqname, start, end,
coordinates,
strand = c("*", "+", "-"),
# Amplifiers - for initial plot view start/end
zoom_factor = 0.2,
bases_flanking = 100,
# Samples to retrieve
tracks,
condition,
...
) {
# Validate mandatory arguments:
if(!is(se, "NxtSE")) .log(paste(
"In getCoverageData(),",
"`se` must be a NxtSE object"
))
args <- as.list(match.call())
args[[1]] <- NULL
args[["se"]] <- NULL
for(argname in names(args)) {
args[[argname]] <- eval(args[[argname]], parent.frame())
}
args[["se"]] <- se
args[["cov_data"]] <- ref(se)
args[["zoom_factor"]] <- zoom_factor
args[["bases_flanking"]] <- bases_flanking
# Interpret coordinates
if (
!all(c("seqname", "start", "end") %in% names(args)) &
"coordinates" %in% names(args)
) {
args[["seqname"]] <- tstrsplit(coordinates,
split = ":", fixed=TRUE)[[1]]
rangetxt <- tstrsplit(coordinates, split = ":", fixed=TRUE)[[2]]
rangetxt <- tstrsplit(rangetxt, split = "/", fixed=TRUE)[[1]]
args[["start"]] <- as.numeric(tstrsplit(
rangetxt, split = "-", fixed=TRUE)[[1]])
args[["end"]] <- as.numeric(tstrsplit(
rangetxt, split = "-", fixed=TRUE)[[2]])
}
strand <- match.arg(strand)
if(!is_valid(strand)) strand <- "*"
args[["strand"]] <- strand
N_steps <- 5
dash_progress("Validating arguments", N_steps)
# Evaluate view_chr, view_start, view_end, and check arguments
args <- .gCD_validate_args(args)
# Retrieve colData
colData <- .gCD_retrieve_colData(args)
dash_progress("Retrieving ASE normalization values", N_steps)
# Retrieve in-range events and corresponding normalization values
normData <- .gCD_retrieve_norms(args, colData)
dash_progress("Retrieving gene annotations", N_steps)
# Retrieve genome annotations
annotation <- .gCD_retrieve_annotations(args)
dash_progress("Retrieving coverage data", N_steps)
raw_cov <- .gCD_retrieve_raw_cov(args, colData)
dash_progress("Retrieving junction counts", N_steps)
juncData <- .gCD_retrieve_junc(args, colData)
# Remove large objects prior to returning
args[["se"]] <- args[["cov_data"]] <- NULL
return(covDataObject(
args = args,
annotation = annotation,
colData = colData,
covData = raw_cov,
juncData = juncData,
normData = normData
))
}
#' @describeIn covDataObject-class Get coverage / genome data for plotting
#' coverage plots
#' @export
getGenomeData <- function(
reference_path,
Gene,
seqname, start, end,
coordinates,
zoom_factor = 0.2,
bases_flanking = 100,
...
) {
if (!file.exists(file.path(reference_path, "cov_data.Rds")))
.log("Given reference_path is not a valid SpliceWiz reference")
args <- as.list(match.call())
args[[1]] <- NULL
for(argname in names(args)) {
args[[argname]] <- eval(args[[argname]], parent.frame())
}
args[["cov_data"]] <- readRDS(file.path(reference_path, "cov_data.Rds"))
# Interpret coordinates
args <- .gCD_validate_args_coords(args)
args <- .gCD_validate_args(args)
annotation <- .gCD_retrieve_annotations(args)
args[["cov_data"]] <- NULL
return(covDataObject(
args = args,
annotation = annotation
))
}
################################################################################
# Internals
.gCD_validate_args <- function(args) {
if(!("reference_path" %in% names(args))) .gCD_validate_args_se(args)
return(.gCD_validate_args_loci(args))
}
.gCD_validate_args_se <- function(args) {
if (!("se" %in% names(args)) || !is(args[["se"]], "NxtSE"))
.log("In getCoverageData, `se` must be a valid `NxtSE` object")
# Warn if old NxtSE object
if(!("row_gr" %in% names(metadata(args[["se"]]))))
.log(paste(
"In getCoverageData,",
"we recommend updating `se` to the latest version",
"using update_NxtSE()"
), "warning")
if (!isCOV(covfile(args[["se"]])))
.log(paste("In getCoverageData,",
"No valid COV files defined in se.",
"Please supply the correct paths of the COV files",
"using covfile(se) <- vector_of_correct_COVfile_paths"))
# Check condition and tracks
# - don't check number of tracks - irrelevant at this stage
# - check condition is of length 1
if("condition" %in% names(args)) {
if(length(args[["condition"]]) != 1)
.log(paste("In getCoverageData,", "condition must be of length 1"))
if(!(args[["condition"]] %in% names(colData(args[["se"]]))))
.log(paste("In getCoverageData,",
"condition must be a valid column name in colData(se)"))
if(!("tracks" %in% names(args)))
.log(paste("In getCoverageData,",
"condition is defined but tracks is missing"))
condition_options <- unique(
colData(args[["se"]])[, args[["condition"]]]
)
if (!all(args[["tracks"]] %in% condition_options))
.log(paste("In getCoverageData,",
"some tracks do not match valid condition names in",
args[["condition"]]))
} else if("tracks" %in% names(args)) {
if (!all(args[["tracks"]] %in% colnames(args[["se"]])))
.log(paste("In getCoverageData,",
"some tracks do not match valid sample names in se"))
}
}
.gCD_validate_args_coords <- function(args) {
# Interpret coordinates
if (
!all(c("seqname", "start", "end") %in% names(args)) &
"coordinates" %in% names(args)
) {
args[["seqname"]] <- tstrsplit(args[["coordinates"]],
split = ":", fixed=TRUE)[[1]]
rangetxt <- tstrsplit(args[["coordinates"]],
split = ":", fixed=TRUE)[[2]]
rangetxt <- tstrsplit(rangetxt, split = "/", fixed=TRUE)[[1]]
args[["start"]] <- as.numeric(tstrsplit(
rangetxt, split = "-", fixed=TRUE)[[1]])
args[["end"]] <- as.numeric(tstrsplit(
rangetxt, split = "-", fixed=TRUE)[[2]])
}
return(args)
}
# Checks Gene and loci. Only this is run if plotGenome is run
.gCD_validate_args_loci <- function(args) {
if (!("se" %in% names(args)) || !is(args[["se"]], "NxtSE")) {
if(!("cov_data" %in% names(args)))
.log("In getGenomeData, `cov_data` is missing")
cov_data <- args[["cov_data"]]
} else {
cov_data <- ref(args[["se"]])
}
args[["seqInfo"]] <- cov_data$seqInfo
# Check we know where to plot
if (
!any(c("Event", "Gene") %in% names(args)) &
!all(c("seqname", "start", "end") %in% names(args))
) {
.log(paste("In getCoverageData / getGenomeData,",
"Event or Gene cannot be empty, unless coordinates are provided"))
} else if (all(c("seqname", "start", "end") %in% names(args))) {
args[["view_chr"]] <- as.character(args[["seqname"]])
args[["view_start"]] <- args[["start"]]
args[["view_end"]] <- args[["end"]]
} else if ("Gene" %in% names(args)) {
if (
!(args[["Gene"]] %in% cov_data$geneList$gene_id) &
!(args[["Gene"]] %in% cov_data$geneList$gene_name)
) {
.log(paste("In getCoverageData / getGenomeData,",
args[["Gene"]], "is not a valid gene symbol or gene_id"))
}
if (!(args[["Gene"]] %in% cov_data$geneList$gene_id)) {
gene.df <- as.data.frame(
cov_data$geneList[get("gene_name") == args[["Gene"]]])
if (nrow(gene.df) != 1) {
.log(paste("In getCoverageData / getGenomeData,",
args[["Gene"]],
"is an ambiguous name referring to 2 or more genes.",
"Please provide its gene_id instead"))
}
} else {
gene.df <- as.data.frame(
cov_data$geneList[get("gene_id") == args[["Gene"]]])
if (nrow(gene.df) != 1) {
.log(paste("In getCoverageData / getGenomeData,",
args[["Gene"]],
"is an ambiguous gene_id referring to 2 or more genes."))
}
}
args[["view_chr"]] <- as.character(gene.df$seqnames)
args[["view_start"]] <- gene.df$start
args[["view_end"]] <- gene.df$end
} else if(!("Event" %in% names(args))) {
.log(paste("In getCoverageData,",
"Event must be given if no other loci information is given"))
} else if(!("se" %in% names(args))){
.log(paste("In getGenomeData,",
"Event cannot be used to locate genomic loci"))
} else {
# Only "Event" given - check this at later step)
return(.gCD_validate_args_event(args))
}
# determine params here - TODO
return(.gCD_determine_loci(args))
}
# Checks whether Event given is valid.
.gCD_validate_args_event <- function(args) {
cov_data <- ref(args[["se"]])
if (!(args[["Event"]] %in% rownames(args[["se"]])))
.log(paste("In getCoverageData,", args[["Event"]],
"is not a valid IR or alternate splicing event in rowData(se)"))
rowData <- rowData(args[["se"]])[args[["Event"]], ]
if(nrow(rowData) > 1) rowData <- rowData[1,]
args[["view_chr"]] <- tstrsplit(rowData$EventRegion, split = ":")[[1]]
temp1 <- tstrsplit(rowData$EventRegion, split = "/")
temp2 <- tstrsplit(temp1[[1]], split = ":")[[2]]
args[["view_start"]] <- as.numeric(tstrsplit(temp2, split = "-")[[1]])
args[["view_end"]] <- as.numeric(tstrsplit(temp2, split = "-")[[2]])
return(.gCD_determine_loci(args))
}
.gCD_determine_loci <- function(args) {
if (!(args[["view_chr"]] %in% names(args[["seqInfo"]])))
.log(paste("In getCoverageData / getGenomeData,",
args[["view_chr"]],
"is not a valid chromosome reference name in the given genome"))
if ("bases_flanking" %in% names(args)) {
if(
!is.numeric(args[["bases_flanking"]]) ||
args[["bases_flanking"]] < 0
) .log(paste("In getCoverageData / getGenomeData,",
"bases_flanking must be a non-negative number"))
} else {
args[["bases_flanking"]] <- 0
}
view_length <- args[["view_end"]] - args[["view_start"]]
if (!is.numeric(view_length) || view_length < 0)
.log(paste("In getCoverageData / getGenomeData,",
"view_length must be a non-negative number"))
if (("zoom_factor" %in% names(args))) {
tryCatch({
args[["zoom_factor"]] <- as.numeric(args[["zoom_factor"]])
}, error = function(e) {
args[["zoom_factor"]] <- 0
})
} else {
args[["zoom_factor"]] <- 0
}
view_center <- (args[["view_start"]] + args[["view_end"]]) / 2
new_view_length <- view_length * 3^args[["zoom_factor"]] +
2 * args[["bases_flanking"]]
view_start <- round(view_center - new_view_length / 2)
view_end <- round(view_center + new_view_length / 2)
# Validate genomic window and shift if invalid
if (view_start < 1) view_start <- 1
seqInfo <- args[["seqInfo"]][args[["view_chr"]]]
seqmax <- GenomeInfoDb::seqlengths(seqInfo)
if(seqmax < 3) .log(paste(
"In getCoverageData / getGenomeData",
"chromosome", args[["view_chr"]], "has sequence length of", seqmax,
"which is not allowed"
))
if (view_end > seqmax) {
view_end <- seqmax - 1
view_start <- view_end - new_view_length
if(view_start < 1) view_start <- 1
}
args[["view_start"]] <- view_start
args[["view_end"]] <- view_end
args <- .gCD_determine_limits(args)
return(args)
}
.gCD_determine_limits <- function(args) {
# Get fetch limits of coverage and annotations, based on a 5-fold window
# of view start/end
view_start <- args[["view_start"]]
view_end <- args[["view_end"]]
seqInfo <- args[["seqInfo"]][args[["view_chr"]]]
seqmax <- GenomeInfoDb::seqlengths(seqInfo)
limit_start <- view_start - 2 * (view_end - view_start)
limit_end <- view_end + 2 * (view_end - view_start)
limit_span <- limit_end - limit_start
if(limit_span > seqmax - 1) limit_span <- seqmax - 2
if(limit_start < 1) {
limit_start <- 1
limit_end <- limit_start + limit_span
}
if (limit_end > seqmax) {
limit_end <- seqmax - 1
limit_start <- limit_end - limit_span
if(limit_start < 1) limit_start <- 1
}
args[["limit_start"]] <- limit_start
args[["limit_end"]] <- limit_end
return(args)
}
################################################################################
# Retrieve relevant colData
.gCD_retrieve_colData <- function(args) {
if(!("tracks" %in% names(args))) {
return(data.frame())
}
tracks <- args[["tracks"]]
colData <- as.data.frame(colData(args[["se"]]))
sampleNames <- colnames(args[["se"]])
if("condition" %in% names(args)) {
cond <- args[["condition"]]
condStates <- colData[, cond]
samples <- sampleNames[condStates %in% tracks]
colData <- colData[samples %in% sampleNames, cond, drop = FALSE]
} else {
colData <- colData[args[["tracks"]] %in% sampleNames, , drop = FALSE]
}
return(colData)
}
################################################################################
# Retrieve relevant raw coverage
.gCD_retrieve_raw_cov <- function(args, colData) {
samples <- rownames(colData)
covfiles <- covfile(args[["se"]])[samples]
session <- shiny::getDefaultReactiveDomain()
if(!is.null(session)) {
shiny::withProgress(message = "Reading COV files", {
raw_cov <- .gCD_retrieve_raw_cov_helper(args, covfiles, samples)
})
} else {
raw_cov <- .gCD_retrieve_raw_cov_helper(args, covfiles, samples)
}
return(raw_cov)
}
.gCD_retrieve_raw_cov_helper <- function(args, covfiles, samples) {
pos <- list()
neg <- list()
uns <- list()
for(i in seq_len(length(samples))) {
dash_progress("Reading COV files", length(samples))
pos[[i]] <- getCoverage(
file = covfiles[i],
seqname = args[["view_chr"]],
start = args[["limit_start"]] - 1, end = args[["limit_end"]],
strand = c("+")
)
neg[[i]] <- getCoverage(
file = covfiles[i],
seqname = args[["view_chr"]],
start = args[["limit_start"]] - 1, end = args[["limit_end"]],
strand = c("-")
)
uns[[i]] <- getCoverage(
file = covfiles[i],
seqname = args[["view_chr"]],
start = args[["limit_start"]] - 1, end = args[["limit_end"]],
strand = c("*")
)
}
names(pos) <- names(neg) <- names(uns) <- samples
return(list(
pos = pos,
neg = neg,
uns = uns
))
}
# Retrieve relevant raw coverage
.gCD_retrieve_junc <- function(args, colData) {
samples <- rownames(colData)
# Retrieve a list of in-range events
view_gr <- GRanges(
seqnames = args[["view_chr"]],
ranges = IRanges(
start = args[["limit_start"]], end = args[["limit_end"]]
),
strand = "*"
)
OL <- findOverlaps(junc_gr(args[["se"]]), view_gr)
junc_counts <- junc_counts(args[["se"]])[
unique(from(OL)),
samples
]
junc_counts_uns <- junc_counts_uns(args[["se"]])[
unique(from(OL)),
samples
]
junc_PSI <- junc_PSI(args[["se"]])[
unique(from(OL)),
samples
]
return(list(
junc_gr = junc_gr(args[["se"]])[unique(from(OL))],
junc_counts = as.matrix(junc_counts),
junc_counts_uns = as.matrix(junc_counts_uns),
junc_PSI = as.matrix(junc_PSI)
))
}
################################################################################
# Retrieve in-range events and corresponding normalization values
.gCD_retrieve_norms <- function(args, colData) {
# Retrieve a list of in-range events
limit_gr <- GRanges(
seqnames = args[["view_chr"]],
ranges = IRanges(
start = args[["limit_start"]], end = args[["limit_end"]]
),
strand = "*"
)
OL <- findOverlaps(row_gr(args[["se"]]), limit_gr, type = "within")
event_gr <- row_gr(args[["se"]])[unique(from(OL))]
names(event_gr) <- rownames(args[["se"]])[unique(from(OL))]
norms <- as.matrix(assay(args[["se"]], "Depth")[
names(event_gr),
rownames(colData),
drop = FALSE
])
rowData <- rowData(args[["se"]])[unique(from(OL)), ]
rowData <- as.data.frame(rowData)
# Get strand data
sampleStrand <- sampleQC(args[["se"]][, rownames(colData)])$strand
names(sampleStrand) <- rownames(colData)
return(list(
event_gr = event_gr,
norms = norms,
rowData = rowData,
sampleStrand = sampleStrand
))
}
################################################################################
# Retrieve relevant annotations from cov_data
.gCD_retrieve_annotations <- function(args) {
view_chr <- args[["view_chr"]]
view_start <- args[["limit_start"]]
view_end <- args[["limit_end"]]
transcripts.DT <- args[["cov_data"]]$transcripts[
get("seqnames") == view_chr &
get("start") <= view_end + (view_end - view_start) &
get("end") >= view_start - (view_end - view_start)
]
setorderv(transcripts.DT, c("transcript_support_level", "width"))
reduced.DT <- args[["cov_data"]]$elements[
get("transcript_id") %in% transcripts.DT$transcript_id]
reduced.DT <- reduced.DT[get("type") %in% c("CDS", "exon", "intron")]
# Transfer feature_id from exons -> CDS
CDS.DT <- reduced.DT[get("type") == "CDS"]
if(nrow(CDS.DT) > 0) {
exons.DT <- reduced.DT[get("type") == "exon"]
introns.DT <- reduced.DT[get("type") == "intron"]
exons.gr <- .grDT(exons.DT)
CDS.gr <- .grDT(CDS.DT)
OL <- findOverlaps(exons.gr, CDS.gr)
OL.DT <- data.table(
from = OL@from, to = OL@to
)
OL.DT[, c("feature_id", "exon_trid", "cds_trid") := list(
exons.DT$feature_id[get("from")],
exons.DT$transcript_id[get("from")],
CDS.DT$transcript_id[get("to")]
)]
OL.DT <- OL.DT[get("exon_trid") == get("cds_trid")]
CDS.DT$feature_id[OL.DT$to] <- OL.DT$feature_id
reduced.DT <- rbind(exons.DT, CDS.DT, introns.DT)
}
transcripts.DT <- transcripts.DT[
get("transcript_id") %in% reduced.DT$transcript_id]
return(list(
transcripts.DT = transcripts.DT,
reduced.DT = reduced.DT
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.