# Exported wrapper functions to plot coverage plots from command line:
#' RNA-seq Coverage Plots and Genome Tracks
#'
#' Generate plotly / ggplot RNA-seq genome and coverage plots from command line.
#' For some quick working examples, see the Examples section below.
#'
#' @details
#' In RNA sequencing, alignments to spliced transcripts will "skip" over genomic
#' regions of introns. This can be illustrated in a plot using a horizontal
#' genomic axis, with the vertical axis representing the number of alignments
#' covering each nucleotide. As a result, the coverage "hills" represent the
#' expression of exons, and "valleys" to introns.
#'
#' Different alternatively-spliced isoforms thus produce different coverage
#' patterns. The change in the coverage across an alternate exon relative to its
#' constitutively-included flanking exons, for example, represents its
#' alternative inclusion or skipping. Similarly, elevation of intron
#' valleys represent increased intron retention.
#'
#' With multiple replicates per sample, coverage is dependent on
#' library size and gene expression. To compare
#' alternative splicing ratios, normalisation of the coverage of the alternate
#' exon (or alternatively retained intron) relative to their constitutive
#' flanking exons, is required. There is no established method for this
#' normalisation, and can be confounded in situations where flanking elements
#' are themselves alternatively spliced.
#'
#' NxtIRF performs this coverage normalisation using the same method as its
#' estimate of spliced / intronic transcript abundance using the `SpliceOverMax`
#' method (see details section in [CollateData]). This normalisation can be
#' applied to correct for library size and gene expression differences between
#' samples of the same experimental condition. After normalisation, mean and
#' variance of coverage can be computed as ratios relative to total transcript
#' abundance. This method can visualise alternatively included genomic regions
#' including casette exons, alternate splice site usage, and intron retention.
#'
#' `Plot_Coverage` generates plots showing depth of alignments to
#' the genomic axis. Plots can be generated for individual samples or samples
#' grouped by experimental conditions. In the latter, mean and 95% confidence
#' intervals are shown.
#'
#' `Plot_Genome` generates genome transcript tracks only. Protein-coding
#' regions are denoted by thick rectangles, whereas non-protein coding
#' transcripts or untranslated regions are denoted with thin rectangles.
#' Introns are denoted as lines.
#'
#' @param se A \linkS4class{NxtSE} object, created by [MakeSE].
#' COV files must be linked to the NxtSE object. To do this, see the example
#' in [MakeSE]. Required by `Plot_Coverage`.
#' @param reference_path The path of the reference generated by
#' [BuildReference]. Required by `Plot_Genome` if a \linkS4class{NxtSE}
#' object is not specified.
#' @param Event The `EventName` of the IR / alternative splicing event to be
#' displayed. Use `rownames(se)` to display a list of valid events.
#' @param Gene Whether to use the range for the given `Gene`. If given,
#' overrides `Event` (but `Event` or `norm_event` will be used to normalise by
#' condition). Valid `Gene` entries include gene_id (Ensembl ID) or gene_name
#' (Gene Symbol).
#' @param seqname,start,end The chromosome (string) and genomic `start/end`
#' coordinates (numeric) of the region to display. If present, overrides both
#' `Event` and `Gene`. E.g. for a given region of chr1:10000-11000,
#' use the parameters: `seqname = "chr1", start = 10000, end = 11000`
#' @param coordinates A string specifying genomic coordinates can be given
#' instead of `seqname,start,end`. Must be of the format "chr:start-end", e.g.
#' "chr1:10000-11000"
#' @param strand Whether to show coverage of both strands "*" (default), or
#' from the "+" or "-" strand only.
#' @param zoom_factor Zoom out from event. Each level of zoom zooms out by a
#' factor of 3. E.g. for a query region of chr1:10000-11000, if a
#' `zoom_factor` of 1.0 is given, chr1:99000-12000 will be displayed.
#' @param tracks The names of individual samples,
#' or the names of the different conditions to be plotted. For the latter, set
#' `condition` to the specified condition category.
#' @param track_names The names of the tracks to be displayed. If omitted, the
#' track_names will default to the input in `tracks`
#' @param condition To display normalised coverage per condition, set this to
#' the condition category. If omitted, `tracks` are assumed to refer to the
#' names of individual samples.
#' @param selected_transcripts (Optional) A vector containing transcript ID
#' or transcript names of transcripts
#' to be displayed on the gene annotation track. Useful to remove minor
#' isoforms that are not relevant to the samples being displayed.
#' @param condense_tracks (default `FALSE`) Whether to collapse the
#' transcript track annotations by gene.
#' @param stack_tracks (default `FALSE`) Whether to graph all the conditions on
#' a single coverage track. If set to `TRUE`, each condition will be displayed
#' in a different colour on the same track. Ignored if `condition` is not set.
#' @param t_test (default `FALSE`) Whether to perform a pair-wise T-test.
#' Only used if there are TWO condition tracks.
#' @param norm_event Whether to normalise by an event different to that given
#' in "Event". The difference between this and Event is that the genomic
#' coordinates can be centered around a different `Event`, `Gene` or region
#' as given in `seqname/start/end`. If `norm_event` is different to
#' `Event`, `norm_event` will be used for normalisation and `Event` will be
#' used to define the genomic coordinates of the viewing window. `norm_event`
#' is required if `Event` is not set and `condition` is set.
#' @param bases_flanking (Default = `100`) How many bases flanking the zoomed
#' window. Useful when
#' used in conjunction with zoom_factor == 0. E.g. for a given region of
#' chr1:10000-11000, if `zoom_factor = 0` and `bases_flanking = 100`, the
#' region chr1:9900-11100 will be displayed.
#' @param p_obj In `as_egg_ggplot`, takes the output of `Plot_Coverage` and
#' plots all tracks in a static plot using `ggarrange` function of the
#' `egg` package. Requires `egg` to be installed.
#'
#' @return A list containing two objects. final_plot is the plotly object.
#' ggplot is a list of ggplot tracks, with:
#'
#' * `ggplot[[n]]` is the nth track (where n = 1, 2, 3 or 4).
#' * `ggplot[[5]]` contains the T-test track if one is generated.
#' * `ggplot[[6]]` always contains the genome track.
#' @examples
#' se <- NxtIRF_example_NxtSE()
#'
#' # Plot the genome track only, with specified gene:
#' p <- Plot_Genome(se, Gene = "SRSF3")
#' p$ggplot
#'
#' # View the genome track, specifying a genomic region via coordinates:
#' p <- Plot_Genome(se, coordinates = "chrZ:10000-20000")
#' p$ggplot
#'
#' # Assign annotation re experimental conditions
#'
#' colData(se)$treatment <- rep(c("A", "B"), each = 3)
#'
#' # Verify that the COV files are linked to the NxtSE object:
#' covfile(se)
#'
#' # Return a list of ggplot and plotly objects
#' p <- Plot_Coverage(
#' se = se,
#' Event = rowData(se)$EventName[1],
#' tracks = colnames(se)[1:4]
#' )
#'
#' # Display a static ggplot / egg::ggarrange stacked plot:
#'
#' as_egg_ggplot(p)
#'
#' # Display the plotly-based interactive Coverage plot:
#' p$final_plot
#'
#' # Plot the same event but by condition "treatment"
#' p <- Plot_Coverage(
#' se, rowData(se)$EventName[1],
#' tracks = c("A", "B"), condition = "treatment"
#' )
#' as_egg_ggplot(p)
#' @name Plot_Coverage
#' @md
NULL
#' @describeIn Plot_Coverage generates plots showing depth of alignments to
#' the genomic axis. Plots can be generated for individual samples or samples
#' grouped by experimental conditions. In the latter, mean and 95% confidence
#' intervals are shown.
#' @export
Plot_Coverage <- function(
se,
Event, Gene,
seqname, start, end, # Optional
coordinates,
strand = c("*", "+", "-"),
zoom_factor, bases_flanking = 100,
tracks,
track_names = tracks,
condition,
selected_transcripts,
condense_tracks = FALSE,
stack_tracks = FALSE,
t_test = FALSE,
norm_event
) {
if ((missing(seqname) | missing(start) | missing(end)) &
!missing(coordinates)) {
gr <- CoordToGR(coordinates)
seqname <- as.character(seqnames(gr))
start <- start(gr)
end <- end(gr)
}
# Validate given arguments
.plot_cov_validate_args(se, tracks, condition, Event, Gene,
seqname, start, end, bases_flanking)
cov_data <- ref(se)
strand <- match.arg(strand)
if (strand == "") strand <- "*"
# Work out viewing coordinates based on given request
coords <- .plot_coverage_determine_params(
se, Event, Gene,
seqname, start, end, zoom_factor, bases_flanking
)
if (missing(norm_event)) {
if (!missing(Event)) {
norm_event <- Event
} else {
norm_event <- ""
}
}
if (missing(condition)) condition <- ""
if (condition != "" & norm_event == "") {
.log(paste("If `condition` is set,",
"you must provide a valid `norm_event` or `Event`,",
"otherwise per-condition depth normalization cannot be performed"
))
}
args <- list(
view_chr = coords$view_chr, view_start = coords$view_start,
view_end = coords$view_end, view_strand = strand,
norm_event = norm_event, condition = condition,
tracks = as.list(tracks), track_names = track_names,
se = se, avail_files = covfile(se),
transcripts = cov_data$transcripts.DT, elems = cov_data$elem.DT,
stack_tracks = stack_tracks,
graph_mode = "Pan", conf.int = 0.95,
t_test = t_test, condensed = condense_tracks
)
if (norm_event != "")
args[["highlight_events"]] <-
.plot_coverage_highlight_events(se, norm_event)
if (!missing(selected_transcripts))
args[["selected_transcripts"]] <- selected_transcripts
p <- do.call(plot_cov_fn, args)
for(i in seq_len(length(p$ggplot) - 1)) {
if(!is.null(p$ggplot[[i]])) {
p$ggplot[[i]] <- p$ggplot[[i]] +
coord_cartesian(
xlim = c(coords$view_start, coords$view_end),
expand = FALSE)
}
}
return(p)
}
#' @describeIn Plot_Coverage Generates a plot of transcripts within a given
#' genomic region, or belonging to a specified gene
#' @export
Plot_Genome <- function(se, reference_path,
Gene, seqname, start, end, coordinates, zoom_factor, bases_flanking = 100,
selected_transcripts,
condense_tracks = FALSE
) {
if (missing(se) & missing(reference_path))
.log("Either one of `reference_path` or `se` is required")
if (missing(se)) {
if (!file.exists(file.path(reference_path, "cov_data.Rds")))
.log("Given reference_path is not a valid NxtIRF reference")
cov_data <- readRDS(file.path(reference_path, "cov_data.Rds"))
} else {
if (!is(se, "NxtSE")) .log("`se` must be a NxtSE object")
cov_data <- ref(se)
}
if ((missing(seqname) | missing(start) | missing(end)) &
!missing(coordinates)) {
gr <- CoordToGR(coordinates)
seqname <- as.character(seqnames(gr))
start <- BiocGenerics::start(gr)
end <- BiocGenerics::end(gr)
}
.plot_cov_validate_args_loci(cov_data,
Gene = Gene, seqname = seqname, start = start, end = end)
coords <- .plot_genome_determine_params(
cov_data, Gene, seqname, start, end, zoom_factor, bases_flanking
)
args <- list(
view_chr = coords$view_chr, view_start = coords$view_start,
view_end = coords$view_end,
transcripts = cov_data$transcripts.DT, elems = cov_data$elem.DT,
condensed = condense_tracks
)
if (!missing(selected_transcripts))
args$selected_transcripts <- selected_transcripts
p_ref <- do.call(plot_view_ref_fn, args)
# Remove legend for p_ref; this causes trouble for plotly
for (i in seq_len(length(p_ref$pl$x$data))) {
p_ref$pl$x$data[[i]]$showlegend <- FALSE
}
p_ref$gp <- p_ref$gp +
theme(legend.position = "none") +
labs(x = paste("Chromosome", coords$view_chr))
final <- list(ggplot = p_ref$gp,
final_plot = p_ref$pl)
return(final)
}
#' @describeIn Plot_Coverage Coerce the `Plot_Coverage()` output as a vertically
#' stacked ggplot, using egg::ggarrange
#' @export
as_egg_ggplot <- function(p_obj) {
.check_package_installed("egg")
if (
!("ggplot" %in% names(p_obj)) ||
!is(p_obj$ggplot[[1]], "ggplot") ||
!is(p_obj$ggplot[[6]], "ggplot")
) .log("Given object is not a recognised Plot_Coverage output object")
plot_tracks <- p_obj$ggplot[
unlist(lapply(p_obj$ggplot, function(x) !is.null(x)))]
egg::ggarrange(plots = plot_tracks, ncol = 1)
}
#' Calls NxtIRF's C++ function to retrieve coverage from a COV file
#'
#' This function returns an RLE / RLEList or data.frame
#' containing coverage data from the given COV file\cr\cr
#' COV files are generated by NxtIRF's [IRFinder] and [BAM2COV] functions.
#' It records alignment coverage for each nucleotide in the given BAM file.
#' It stores this data in "COV" format, which is an indexed BGZF-compressed
#' format specialised for the storage of unstranded and stranded alignment
#' coverage in RNA sequencing.\cr\cr
#' Unlike BigWig files, COV files store coverage for both positive and negative
#' strands.\cr\cr
#' These functions retrieves coverage data from the specified COV file. They
#' are computationally efficient as they utilise random-access to rapidly
#' search for the requested data from the COV file.\cr\cr
#'
#' @param file (Required) The file name of the COV file
#' @param seqname (Required for `GetCoverage_DF`) A string denoting the
#' chromosome name. If left blank in `GetCoverage`, retrieves RLEList
#' containing coverage of the entire file.
#' @param start,end 1-based genomic coordinates. If `start = 0` and
#' `end = 0`, will retrieve RLE of specified chromosome.
#' @param strand Either "*", "+", or "-"
#' @param strandMode The stranded-ness of the RNA-seq experiment. "unstranded"
#' means that an unstranded protocol was used. Stranded protocols can be
#' either "forward", where the first read is the same strand as the
#' expressed transcript, or "reverse" where the second strand is the same
#' strand as the expressed transcript.
#' @param regions A GRanges object for a set of regions to obtain mean / total
#' coverage from the given COV file.
#' @param region In `GetCoverageBins`, a single query region as a GRanges object
#' @param bins In `GetCoverageBins`, the number of bins to divide the given
#' `region`. If `bin_size` is given, overrides this parameter
#' @param bin_size In `GetCoverageBins`, the number of nucleotides per bin
#' @return
#' For `GetCoverage`: If seqname is left as "", returns an RLEList of the
#' whole BAM file, with each RLE in the list containing coverage data for
#' one chromosome. Otherwise, returns an RLE containing coverage data for
#' the requested genomic region
#'
#' For `GetCoverage_DF`: Returns a two-column data frame, with the first column
#' `coordinate` denoting genomic coordinate, and the second column `value`
#' containing the coverage depth for each coordinate nucleotide.
#'
#' For `GetCoverageRegions`: Returns a GRanges object with an extra metacolumn:
#' `cov_mean`, which gives the mean coverage of each of the given ranges.
#'
#' For `GetCoverageBins`: Returns a GRanges object which spans the given
#' `region`, divided by the number of `bins` or by width as given by
#' `bin_size`. Mean coverage in each bin is calculated (returned by the
#' `cov_mean` metadata column). This function is useful for retrieving
#' coverage of a large region for visualisation, especially when the
#' size of the region vastly exceeds the width of the figure.
#'
#' @examples
#' se <- NxtIRF_example_NxtSE()
#'
#' cov_file <- covfile(se)[1]
#'
#' # Retrieve Coverage as RLE
#'
#' cov <- GetCoverage(cov_file, seqname = "chrZ",
#' start = 10000, end = 20000,
#' strand = "*"
#' )
#'
#' # Retrieve Coverage as data.frame
#'
#' cov.df <- GetCoverage_DF(cov_file, seqname = "chrZ",
#' start = 10000, end = 20000,
#' strand = "*"
#' )
#'
#' # Retrieve mean coverage of 100-nt window regions as defined
#' # in a GRanges object:
#'
#' gr <- GenomicRanges::GRanges(
#' seqnames = "chrZ",
#' ranges = IRanges::IRanges(
#' start = seq(1, 99901, by = 100),
#' end = seq(100, 100000, by = 100)
#' ), strand = "-"
#' )
#'
#' gr.unstranded <- GetCoverageRegions(cov_file,
#' regions = gr,
#' strandMode = "unstranded"
#' )
#'
#' gr.stranded <- GetCoverageRegions(cov_file,
#' regions = gr,
#' strandMode = "reverse"
#' )
#'
#' # Retrieve binned coverage of a large region
#'
#' gr.fetch = GetCoverageBins(
#' cov_file,
#' region = GenomicRanges::GRanges(seqnames = "chrZ",
#' ranges = IRanges::IRanges(start = 100, end = 100000),
#' strand = "*"
#' ),
#' bins = 2000
#' )
#'
#' # Plot coverage using ggplot:
#'
#' require(ggplot2)
#'
#' ggplot(cov.df, aes(x = coordinate, y = value)) +
#' geom_line() + theme_white
#'
#' ggplot(as.data.frame(gr.unstranded),
#' aes(x = (start + end) / 2, y = cov_mean)) +
#' geom_line() + theme_white
#'
#' ggplot(as.data.frame(gr.fetch),
#' aes(x = (start + end)/2, y = cov_mean)) +
#' geom_line() + theme_white
#'
#' # Export COV data as BigWig
#'
#' cov_whole <- GetCoverage(cov_file)
#' bw_file <- file.path(tempdir(), "sample.bw")
#' rtracklayer::export(cov_whole, bw_file, "bw")
#' @name Coverage
#' @md
NULL
#' @describeIn Coverage Retrieves alignment coverage as an RLE or RLElist
#' @export
GetCoverage <- function(file, seqname = "", start = 0, end = 0,
strand = c("*", "+", "-")) {
strand <- match.arg(strand)
if (!(strand %in% c("*", "+", "-"))) {
.log(paste("In GetCoverage(),",
"Invalid strand. '*', '+' or '-'"))
}
strand_int <- ifelse(strand == "*", 2,
ifelse(strand == "+", 1, 0))
if (!is.numeric(start) || !is.numeric(end) ||
(as.numeric(start) > as.numeric(end) & as.numeric(end) > 0)) {
.log(paste("In GetCoverage(),",
"Zero or negative regions are not allowed"))
}
if (seqname == "") {
raw_list <- IRF_RLEList_From_Cov(normalizePath(file), strand_int)
final_list <- list()
if (length(raw_list) > 0) {
for (i in seq_len(length(raw_list))) {
final_list[[i]] <- S4Vectors::Rle(
raw_list[[i]]$values, raw_list[[i]]$lengths
)
}
} else {
return(NULL)
}
final_RLE <- as(final_list, "RleList")
names(final_RLE) <- names(raw_list)
return(final_RLE)
} else if (end == 0) {
raw_RLE <- IRF_RLE_From_Cov(
normalizePath(file), as.character(seqname),
0, 0, strand_int
)
final_RLE <- S4Vectors::Rle(raw_RLE$values, raw_RLE$lengths)
} else {
raw_RLE <- IRF_RLE_From_Cov(
normalizePath(file), as.character(seqname),
round(as.numeric(start)), round(as.numeric(end)),
strand_int
)
final_RLE <- S4Vectors::Rle(raw_RLE$values, raw_RLE$lengths)
}
}
.cov_process_regions <- function(file, gr, seq, strand_gr, strand_cov) {
# adds cov_mean from cov file to gr, only for given seqname seq
# strand_gr and strand_cov are matching strand info for gr and cov
if (!any(
as.character(seqnames(gr)) %in% seq &
as.character(strand(gr)) %in% strand_gr
)) {
return(gr)
}
todo <- which(
as.character(seqnames(gr)) %in% seq &
as.character(strand(gr)) %in% strand_gr
)
cov_data <- GetCoverage(
file, seq,
min(start(gr[todo])) - 1,
max(end(gr[todo])),
strand_cov
)
if (is.null(gr$cov_mean)) gr$cov_mean <- 0
gr$cov_mean[todo] <- round(
aggregate(
cov_data, IRanges(start(gr[todo]), end(gr[todo])), FUN = mean
), 2
)
return(gr)
}
#' @describeIn Coverage Retrieves alignment coverage as a data.frame
#' @export
GetCoverage_DF <- function(file, seqname = "", start = 0, end = 0,
strand = c("*", "+", "-")
) {
if (seqname == "") .log("seqname must not be omitted in GetCoverage_DF")
cov <- GetCoverage(file, seqname, start, end, strand)
view <- IRanges::Views(cov, start + 1, end)
view.df <- as.data.frame(view[[1]])
return(data.frame(
coordinate = seq(start + 1, end), value = view.df$value
))
}
#' @describeIn Coverage Retrieves total and mean coverage of a GRanges object
#' from a COV file
#' @export
GetCoverageRegions <- function(file, regions,
strandMode = c("unstranded", "forward", "reverse")
) {
strandMode <- match.arg(strandMode)
if (strandMode == "") strandMode <- "unstranded"
if (!is(regions, "GRanges")) .log("regions must be a GRanges object")
if (!IsCOV(file)) .log("Given file is not of COV format")
seqlevels <- IRF_Cov_Seqnames(normalizePath(file))
# trim regions by available seqlevels
if (!any(seqlevels %in% seqlevels(regions)))
.log("COV file and given regions have incompatible seqnames")
seqlevels(regions, pruning.mode = "coarse") <- seqlevels
if (length(regions) == 0) {
return(regions)
}
if (strandMode == "unstranded") {
for (seq in unique(seqnames(regions))) {
regions <- .cov_process_regions(file, regions, seq,
c("+", "-", "*"), "*")
}
} else if (strandMode == "forward") {
for (seq in unique(seqnames(regions))) {
regions <- .cov_process_regions(file, regions, seq, "*", "*")
regions <- .cov_process_regions(file, regions, seq, "+", "+")
regions <- .cov_process_regions(file, regions, seq, "-", "-")
}
} else {
for (seq in unique(seqnames(regions))) {
regions <- .cov_process_regions(file, regions, seq, "*", "*")
regions <- .cov_process_regions(file, regions, seq, "-", "+")
regions <- .cov_process_regions(file, regions, seq, "+", "-")
}
}
return(regions)
}
#' @describeIn Coverage Retrieves coverage of a single region from a COV file,
#' binned by the given number of bins or bin_size
#' @export
GetCoverageBins <- function(file, region, bins = 2000,
strandMode = c("unstranded", "forward", "reverse"),
bin_size
) {
strandMode <- match.arg(strandMode)
if (strandMode == "") strandMode <- "unstranded"
if (!is(region, "GRanges")) .log("region must be a GRanges object")
region <- region[1]
if (!IsCOV(file)) .log("Given file is not of COV format")
seqlevels <- IRF_Cov_Seqnames(normalizePath(file))
if(!(as.character(seqnames(region)) %in% seqlevels))
.log("Given region is on a chromosome that is missing in COV file")
if(!is.numeric(bins) || bins < 1) .log("`bins` must be a numeric value")
if(missing(bin_size) || !is.numeric(bin_size) ||
bin_size > width(region) || bin_size < 1) {
bin_size <- ceiling(width(region) / bins)
}
gr.fetch <- .bin_gr(region, bin_size)
if (strandMode == "unstranded") {
strand = "*"
} else if (strandMode == "reverse") {
if(strand(region) == "+") {
strand <- "-"
} else if(strand(region == "-")) {
strand <- "+"
} else {
strand <- "*"
}
} else {
strand <- as.character(strand(region))
}
df <- as.data.frame(.internal_get_coverage_as_df(
"sample", file,
as.character(seqnames(region)),
start(region), end(region), strand)
)
df <- bin_df(df, bin_size)
gr.fetch$cov_mean <- df$sample
return(gr.fetch)
}
.bin_gr <- function(gr, window_size) {
brks <- seq(1, width(gr) + 1, length.out = (width(gr) + 1) / window_size)
DT <- data.table(coord = seq(start(gr), end(gr)))
DT[, c("bin") := findInterval(seq_len(nrow(DT)), brks)]
DT2 <- DT[, .(start = min(get("coord")), end = max(get("coord"))),
by = "bin"]
DT2[, c("seqnames", "strand") :=
list(as.character(seqnames(gr)), as.character(strand(gr)))]
.grDT(DT2)
}
########## Internal functions ##########
# Validate given arguments in Plot_Coverage()
.plot_cov_validate_args <- function(se, tracks, condition,
Event, Gene,
seqname, start, end, bases_flanking
) {
.plot_cov_validate_args_se(se, tracks, condition)
cov_data <- ref(se)
checked <- .plot_cov_validate_args_loci(
cov_data, Event, Gene, seqname, start, end)
if (!checked) .plot_cov_validate_args_event(se, Event, bases_flanking)
}
# Check se, tracks, conditions
.plot_cov_validate_args_se <- function(se, tracks, condition) {
if (missing(se) || !is(se, "NxtSE"))
.log("In Plot_Coverage, `se` must be a valid `NxtSE` object")
if (!all(file.exists(covfile(se))))
.log(paste("In Plot_Coverage,",
"COV files are not defined in se.",
"Please supply the correct paths of the COV files",
"using covfile(se) <- vector_of_correct_COVfile_paths"))
# Check condition and tracks
if (length(tracks) < 1 | length(tracks) > 4)
.log(paste("In Plot_Coverage,", "tracks must be of length 1-4"))
if (!missing(condition)) {
if (length(condition) != 1)
.log(paste("In Plot_Coverage,", "condition must be of length 1"))
if (!(condition %in% names(colData(se))))
.log(paste("In Plot_Coverage,",
"condition must be a valid column name in colData(se)"))
condition_options <- unique(colData(se)[, condition])
if (!all(tracks %in% condition_options))
.log(paste("In Plot_Coverage,",
"some tracks do not match valid condition names in",
args[["condition"]]))
} else {
if (!all(tracks %in% colnames(se)))
.log(paste("In Plot_Coverage,",
"some tracks do not match valid sample names in se"))
}
}
# Checks Gene and loci. Only this is run if Plot_Genome is run
.plot_cov_validate_args_loci <- function(cov_data,
Event, Gene, seqname, start, end, bases_flanking = 0
) {
if (!all(c("seqInfo", "gene_list", "elem.DT", "transcripts.DT") %in%
names(cov_data)))
.log(paste("In Plot_Coverage,",
"cov_data must be a valid object",
"created by prepare_covplot_data()"))
# Check we know where to plot
if (missing(Event) & missing(Gene) &
(missing(seqname) | missing(start) | missing(end))
) {
.log(paste("In Plot_Coverage,",
"Event or Gene cannot be empty, unless coordinates are provided"))
} else if ((is_valid(seqname) & is_valid(start) & is_valid(end))) {
view_chr <- as.character(seqname)
view_start <- start
view_end <- end
} else if (is_valid(Gene)) {
if (!(Gene %in% cov_data$gene_list$gene_id) &
!(Gene %in% cov_data$gene_list$gene_name)) {
.log(paste("In Plot_Coverage,",
Gene, "is not a valid gene symbol or Ensembl gene id"))
}
if (!(Gene %in% cov_data$gene_list$gene_id)) {
gene.df <- as.data.frame(
cov_data$gene_list[get("gene_name") == get("Gene")])
if (nrow(gene.df) != 1) {
.log(paste("In Plot_Coverage,", Gene,
"is an ambiguous name referring to 2 or more genes.",
"Please provide its gene_id instead"))
}
} else {
gene.df <- as.data.frame(
cov_data$gene_list[get("gene_id") == get("Gene")])
}
view_chr <- as.character(gene.df$seqnames)
view_start <- gene.df$start
view_end <- gene.df$end
} else {
return(FALSE)
}
view_center <- (view_start + view_end) / 2
view_length <- view_end - view_start
if (!(view_chr %in% names(cov_data$seqInfo)))
.log(paste("In Plot_Coverage,", view_chr,
"is not a valid chromosome reference name in the given genome"))
if (is_valid(bases_flanking) &&
(!is.numeric(bases_flanking) || bases_flanking < 0))
.log(paste("In Plot_Coverage,",
"bases_flanking must be a non-negative number"))
if (!is.numeric(view_length) || view_length < 0)
.log(paste("In Plot_Coverage,",
"view_length must be a non-negative number"))
return(TRUE)
}
# Checks whether Event given is valid.
.plot_cov_validate_args_event <- function(se, Event, bases_flanking) {
cov_data <- ref(se)
rowData <- as.data.frame(rowData(se))
if (!(Event %in% rownames(rowData))) {
.log(paste("In Plot_Coverage,", Event,
"is not a valid IR or alternate splicing event in rowData(se)"))
}
rowData <- rowData[Event, ]
view_chr <- tstrsplit(rowData$EventRegion, split = ":")[[1]]
temp1 <- tstrsplit(rowData$EventRegion, split = "/")
temp2 <- tstrsplit(temp1[[1]], split = ":")[[2]]
view_start <- as.numeric(tstrsplit(temp2, split = "-")[[1]])
view_end <- as.numeric(tstrsplit(temp2, split = "-")[[2]])
view_center <- (view_start + view_end) / 2
view_length <- view_end - view_start
if (!(view_chr %in% names(cov_data$seqInfo)))
.log(paste("In Plot_Coverage,", view_chr,
"is not a valid chromosome reference name in the given genome"))
if (is_valid(bases_flanking) &&
(!is.numeric(bases_flanking) || bases_flanking < 0))
.log(paste("In Plot_Coverage,",
"bases_flanking must be a non-negative number"))
if (!is.numeric(view_length) || view_length < 0)
.log(paste("In Plot_Coverage,",
"view_length must be a non-negative number"))
return(TRUE)
}
# Work out viewing coordinates based on given request
.plot_coverage_determine_params <- function(
se, Event, Gene,
seqname, start, end, zoom_factor, bases_flanking
) {
cov_data <- ref(se)
if (!missing(zoom_factor)) {
tryCatch({
zoom_factor <- as.numeric(zoom_factor)
}, error = function(e) {
zoom_factor <- NULL
})
}
# Prepare zoom window
if (!missing(seqname) & !missing(start) & !missing(end)) {
view_chr <- as.character(seqname)
view_start <- start
view_end <- end
if (!is_valid(zoom_factor)) zoom_factor <- 0
} else if (!missing(Gene)) {
if (Gene %in% cov_data$gene_list$gene_id) {
gene.df <- as.data.frame(
cov_data$gene_list[get("gene_id") == get("Gene")])
} else {
gene.df <- as.data.frame(
cov_data$gene_list[get("gene_name") == get("Gene")])
}
view_chr <- as.character(gene.df$seqnames)
view_start <- gene.df$start
view_end <- gene.df$end
if (!is_valid(zoom_factor)) zoom_factor <- 0
} else {
rowData <- as.data.frame(rowData(se))
rowData <- rowData[Event, ]
view_chr <- tstrsplit(rowData$EventRegion, split = ":")[[1]]
temp1 <- tstrsplit(rowData$EventRegion, split = "/")
temp2 <- tstrsplit(temp1[[1]], split = ":")[[2]]
view_start <- as.numeric(tstrsplit(temp2, split = "-")[[1]])
view_end <- as.numeric(tstrsplit(temp2, split = "-")[[2]])
if (!is_valid(zoom_factor)) zoom_factor <- 1
}
if (zoom_factor < 0) zoom_factor <- 0
# Apply zoom
view_center <- (view_start + view_end) / 2
view_length <- view_end - view_start
new_view_length <- view_length * 3^zoom_factor + 2 * bases_flanking
view_start <- round(view_center - new_view_length / 2)
view_end <- round(view_center + new_view_length / 2)
# Validate genomic window and shift if invalid
if (view_start < 1) view_start <- 1
seqInfo <- cov_data$seqInfo[view_chr]
seqmax <- GenomeInfoDb::seqlengths(seqInfo)
if (view_end > seqmax) view_end <- seqmax - 1
return(list(
view_chr = view_chr, view_start = view_start, view_end = view_end
))
}
.plot_genome_determine_params <- function(
cov_data, Gene, seqname, start, end, zoom_factor, bases_flanking
) {
if (!missing(zoom_factor)) {
tryCatch({
zoom_factor <- as.numeric(zoom_factor)
}, error = function(e) {
zoom_factor <- NULL
})
}
if (!missing(seqname) & !missing(start) & !missing(end)) {
view_chr <- as.character(seqname)
view_start <- start
view_end <- end
if (!is_valid(zoom_factor)) zoom_factor <- 0
} else if (!missing(Gene)) {
if (Gene %in% cov_data$gene_list$gene_id) {
gene.df <- as.data.frame(
cov_data$gene_list[get("gene_id") == get("Gene")])
} else {
gene.df <- as.data.frame(
cov_data$gene_list[get("gene_name") == get("Gene")])
}
view_chr <- as.character(gene.df$seqnames)
view_start <- gene.df$start
view_end <- gene.df$end
if (!is_valid(zoom_factor)) zoom_factor <- 0
} else {
.log("Either coordinates or gene name should be given")
}
if (zoom_factor < 0) zoom_factor <- 0
# Apply zoom
view_center <- (view_start + view_end) / 2
view_length <- view_end - view_start
new_view_length <- view_length * 3^zoom_factor + 2 * bases_flanking
view_start <- round(view_center - new_view_length / 2)
view_end <- round(view_center + new_view_length / 2)
# Validate genomic window and shift if invalid
if (view_start < 1) view_start <- 1
seqInfo <- cov_data$seqInfo[view_chr]
seqmax <- GenomeInfoDb::seqlengths(seqInfo)
if (view_end > seqmax) view_end <- seqmax - 1
return(list(
view_chr = view_chr, view_start = view_start, view_end = view_end
))
}
# Determines what events to highlight given `norm_event`
.plot_coverage_highlight_events <- function(se, norm_event) {
events_to_highlight <- list()
rowData <- as.data.frame(rowData(se))
if (rowData$EventType[match(norm_event, rowData$EventName)]
%in% c("MXE", "SE")) {
events_to_highlight[[1]] <- c(
rowData$Event1a[match(norm_event, rowData$EventName)],
rowData$Event2a[match(norm_event, rowData$EventName)])
} else {
events_to_highlight[[1]] <- rowData$Event1a[
match(norm_event, rowData$EventName)]
}
if (rowData$EventType[match(norm_event, rowData$EventName)]
%in% c("MXE")) {
events_to_highlight[[2]] <- c(
rowData$Event1b[match(norm_event, rowData$EventName)],
rowData$Event2b[match(norm_event, rowData$EventName)])
} else if (rowData$EventType[match(norm_event, rowData$EventName)]
%in% c("SE", "A3SS", "A5SS", "ALE", "AFE")) {
events_to_highlight[[2]] <- rowData$Event1b[
match(norm_event, rowData$EventName)]
}
return(events_to_highlight)
}
# Internal function used to plot everything
plot_cov_fn <- function(
view_chr, view_start, view_end, view_strand,
norm_event, condition, tracks = list(), track_names = NULL, se, avail_files,
transcripts, elems, highlight_events = "", selected_transcripts = "",
stack_tracks, graph_mode, conf.int = 0.95,
t_test = FALSE, condensed = FALSE
) {
args <- as.list(match.call())
if (is.null(track_names)) args$track_names <- unlist(tracks)
p_ref <- plot_view_ref_fn(
view_chr, view_start, view_end,
transcripts, elems, highlight_events,
condensed = condensed,
selected_transcripts = selected_transcripts
)
data.t_test <- list()
cur_zoom <- floor(log((view_end - view_start) / 50) / log(3))
if (is_valid(condition) & is_valid(norm_event)) {
# Calculate normalized values given `condition` and `norm_event`
calcs <- do.call(.plot_cov_fn_normalize_condition, args)
if (stack_tracks == TRUE) {
plot_objs <- .plot_cov_fn_plot_by_condition_stacked(calcs, args)
} else {
plot_objs <- .plot_cov_fn_plot_by_condition_unstacked(calcs, args)
}
if (t_test) plot_objs <- .plot_cov_fn_ttest(plot_objs, calcs)
} else if (!is_valid(condition)) {
# Plot individual coverages on separate tracks
plot_objs <- do.call(.plot_cov_fn_indiv, args)
}
# Summarize non-null tracks
plot_tracks <- plot_objs$pl_track[
unlist(lapply(plot_objs$pl_track, function(x) !is.null(x)))]
# Remove legend for p_ref; this causes trouble for plotly
for (i in seq_len(length(p_ref$pl$x$data))) {
p_ref$pl$x$data[[i]]$showlegend <- FALSE
}
# Put the reference track on the final position
plot_tracks[[length(plot_tracks) + 1]] <- p_ref$pl
# Put the reference track in position #6 of ggplot list
plot_objs$gp_track[[6]] <- p_ref$gp +
theme(legend.position = "none") +
labs(x = paste("Chromosome", view_chr))
# Combine multiple tracks into a plotly plot
final_plot <- plot_cov_fn_finalize(
plot_tracks, view_start, view_end, graph_mode)
return(list(ggplot = plot_objs$gp_track, final_plot = final_plot))
}
############################### PLOT GENOME TRACK ##############################
# Plots the transcript track, highlighting where required
plot_view_ref_fn <- function(
view_chr, view_start, view_end,
transcripts, elems, highlight_events = "",
condensed = FALSE, selected_transcripts = ""
) {
DTlist <- .plot_view_ref_fn_getDTlist(
view_chr, view_start, view_end,
transcripts, elems, highlight_events,
condensed = condensed, selected_transcripts
)
DTplotlist <- .plot_view_ref_fn_groupDTlist(DTlist,
view_chr, view_start, view_end, highlight_events)
return(.plot_view_ref_fn_plotDTlist(DTplotlist,
view_chr, view_start, view_end, highlight_events))
}
.plot_view_ref_fn_getDTlist <- function(
view_chr, view_start, view_end,
transcripts, elems, highlight_events = "",
condensed = FALSE, selected_transcripts = ""
) {
transcripts.DT <- transcripts[
get("seqnames") == view_chr &
get("start") <= view_end + (view_end - view_start) &
get("end") >= view_start - (view_end - view_start)
]
setorderv(transcripts.DT, c("transcript_support_level", "width"))
if (length(selected_transcripts) > 1 || selected_transcripts != "") {
transcripts.DT <- transcripts.DT[
get("transcript_id") %in% selected_transcripts |
get("transcript_name") %in% selected_transcripts
]
} # filter transcripts if applicable
screen.DT <- elems[
get("transcript_id") %in% transcripts.DT$transcript_id &
get("type") %in% c("CDS", "start_codon", "stop_codon", "exon")
]
if (condensed != TRUE & nrow(transcripts.DT) <= 100) {
condense_this <- FALSE
transcripts.DT[, c("group_id") := get("transcript_id")]
screen.DT[, c("group_id") := get("transcript_id")]
} else {
condense_this <- TRUE
transcripts.DT[, c("group_id") := get("gene_id")]
screen.DT[transcripts.DT, on = "transcript_id",
c("group_id") := get("gene_id")]
}
reduced.DT <- copy(screen.DT)
reduced.DT[get("type") %in% c("CDS", "start_codon", "stop_codon"),
c("type") := "CDS"]
reduced.DT[get("type") != "CDS", c("type") := "exon"]
introns.DT <- as.data.table(.grlGaps(
split(.grDT(reduced.DT), reduced.DT$transcript_id)))
introns.DT[, c("type") := "intron"]
setnames(introns.DT, "group_name", "transcript_id")
introns.DT[reduced.DT, on = "transcript_id",
"group_id" := get("i.group_id")]
filter_cols <- c("seqnames", "start", "end", "strand",
"type", "group_id", "transcript_id")
reduced.DT <- rbind(reduced.DT[, filter_cols, with = FALSE],
introns.DT[, filter_cols, with = FALSE])
# Highlight events here
# highlight_events is of syntax chrX:10000-11000/-
if (length(highlight_events) > 1 || highlight_events != "")
reduced.DT <- determine_compatible_events(reduced.DT, highlight_events)
return(list(
transcripts.DT = transcripts.DT,
reduced.DT = reduced.DT,
condense_this = condense_this
))
}
determine_compatible_events <- function(reduced.DT, highlight_events) {
introns <- reduced.DT[get("type") == "intron"]
introns[, c("highlight") := "0"]
exons <- reduced.DT[get("type") == "exon"]
exons[, c("highlight") := "0"]
misc <- reduced.DT[get("type") == "CDS"]
misc[, c("highlight") := "0"]
tr_filter <- c()
if (length(highlight_events) == 1) {
gr <- CoordToGR(highlight_events[[1]])
introns.gr <- .grDT(introns)
OL <- findOverlaps(gr, introns.gr)
introns[OL@to, c("highlight") := 1]
OL2 <- findOverlaps(gr, introns.gr, type = "equal")
introns[OL2@to, c("highlight") := 2]
} else if (length(highlight_events) == 2) {
AS_count <- 1
for (event in highlight_events) {
gr <- CoordToGR(event)
introns.gr <- .grDT(introns)
OL <- findOverlaps(gr, introns.gr, type = "equal")
introns[OL@to, c("highlight") := as.character(AS_count)]
OL_s1 <- findOverlaps(gr[1], introns.gr, type = "equal")
tr1 <- unique(introns$transcript_id[OL_s1@to])
if (length(gr) == 2) {
OL_s2 <- findOverlaps(gr[2], introns.gr, type = "equal")
tr1 <- unique(intersect(tr1, introns$transcript_id[OL_s2@to]))
}
tr_filter <- c(tr_filter, tr1)
coord_keys <- c(start(gr[1]) - 1, end(gr[1]) + 1)
if (length(gr) == 2) {
coord_keys <- c(coord_keys,
start(gr[2]) - 1, end(gr[2]) + 1)
}
exons[get("transcript_id") %in% tr1 &
(get("start") %in% coord_keys | get("end") %in% coord_keys),
c("highlight") := as.character(AS_count)]
AS_count <- AS_count + 1
}
}
return(rbind(introns, exons, misc))
}
.plot_view_ref_fn_groupDTlist <- function(DTlist,
view_chr, view_start, view_end, highlight_events = ""
) {
transcripts.DT <- DTlist$transcripts.DT
reduced.DT <- DTlist$reduced.DT
condense_this <- DTlist$condense_this
group.grl <- split(.grDT(transcripts.DT), transcripts.DT$group_id)
group.DT <- as.data.table(range(group.grl))
group.DT$group <- NULL
data.table::setnames(group.DT, "group_name", "group_id")
# apply plot_order on transcripts.DT
OL <- findOverlaps(.grDT(group.DT), .grDT(group.DT), ignore.strand = TRUE)
group.DT$plot_level <- 1
cur_level <- 1
while (any(group.DT$plot_level == cur_level)) {
j <- match(cur_level, group.DT$plot_level)
repeat {
bump_up_trs <- unique(OL@to[OL@from == j])
bump_up_trs <- bump_up_trs[bump_up_trs > j]
bump_up_trs <- bump_up_trs[
group.DT$plot_level[bump_up_trs] == cur_level]
if (length(bump_up_trs) > 0)
group.DT[bump_up_trs, c("plot_level") := cur_level + 1]
j <- j + match(cur_level, group.DT$plot_level[-seq_len(j)])
if (is.na(j)) break
}
cur_level <- cur_level + 1
}
if (condense_this == TRUE) {
group.DT[transcripts.DT, on = "group_id",
c("group_name", "group_biotype") :=
list(get("i.gene_name"), get("i.gene_biotype"))]
} else {
group.DT[transcripts.DT, on = "group_id",
c("group_name", "group_biotype") :=
list(get("i.transcript_name"), get("i.transcript_biotype"))]
}
group.DT <- group.DT[get("end") > view_start & get("start") < view_end]
group.DT[get("strand") == "+", c("display_name") :=
paste(get("group_name"), "-", get("group_biotype"), " ->>")]
group.DT[get("strand") == "-", c("display_name") :=
paste("<-- ", get("group_name"), "-", get("group_biotype"))]
group.DT[, c("disp_x") := 0.5 * (get("start") + get("end"))]
group.DT[get("start") < view_start & get("end") > view_start,
c("disp_x") := 0.5 * (view_start + get("end"))]
group.DT[get("end") > view_end & get("start") < view_end,
c("disp_x") := 0.5 * (get("start") + view_end)]
group.DT[get("start") < view_start & get("end") > view_end,
c("disp_x") := 0.5 * (view_start + view_end)]
reduced.DT$group_id <- factor(reduced.DT$group_id,
unique(group.DT$group_id), ordered = TRUE)
reduced.DT[group.DT, on = "group_id",
c("plot_level") := get("i.plot_level")]
if (length(highlight_events) == 1 && highlight_events == "") {
reduced.DT[, c("highlight") := FALSE]
} else {
setorderv(reduced.DT, "highlight")
}
return(list(
group.DT = group.DT,
reduced.DT = reduced.DT,
condense_this = condense_this
))
}
.plot_view_ref_fn_plotDTlist <- function(DTplotlist,
view_chr, view_start, view_end, highlight_events
) {
group.DT <- DTplotlist$group.DT
reduced <- as.data.frame(DTplotlist$reduced.DT)
reduced <- reduced[!is.na(reduced$plot_level), ]
condense_this <- DTplotlist$condense_this
p <- ggplot(reduced)
if (nrow(subset(reduced, type = "intron")) > 0) {
p <- p + geom_segment(data = reduced[reduced$type == "intron", ],
aes(x = get("start"), xend = get("end"),
y = get("plot_level"), yend = get("plot_level"),
color = get("highlight")))
}
if (nrow(reduced[reduced$type != "intron", ]) > 0) {
p <- p + geom_rect(data = reduced[reduced$type != "intron", ],
aes(xmin = get("start"), xmax = get("end"),
ymin = get("plot_level") - 0.1 -
ifelse(get("type") %in%
c("CDS", "start_codon", "stop_codon"), 0.1, 0),
ymax = get("plot_level") + 0.1 +
ifelse(get("type") %in%
c("CDS", "start_codon", "stop_codon"), 0.1, 0),
fill = get("highlight")
)
)
}
if (length(highlight_events) > 1 || highlight_events != "") {
p <- p + scale_color_manual(values = c("black", "blue", "red")) +
scale_fill_manual(values = c("black", "blue", "red"))
}
p <- p + theme_white_legend_plot_track +
theme(axis.text.y = element_blank(), axis.title.y = element_blank(),
legend.title = element_blank())
if (condense_this == TRUE) {
anno <- list(
x = group.DT$disp_x,
y = group.DT$plot_level - 0.5 + 0.3 * runif(rep(1, nrow(group.DT))),
text = group.DT$display_name,
xref = "x", yref = "y", showarrow = FALSE)
} else {
anno <- list(
x = group.DT$disp_x,
y = group.DT$plot_level - 0.4,
text = group.DT$display_name,
xref = "x", yref = "y", showarrow = FALSE)
}
if (nrow(group.DT) == 0) {
max_plot_level <- 1
} else {
max_plot_level <- max(group.DT$plot_level)
}
gp <- p + geom_text(data = data.frame(x = anno[["x"]], y = anno[["y"]],
text = anno[["text"]]),
aes(x = get("x"), y = get("y"), label = get("text")))
gp <- gp + coord_cartesian(xlim = c(view_start, view_end),
ylim = c(min(reduced$plot_level) - 2, max(reduced$plot_level)) + 0.5,
expand = FALSE)
pl <- ggplotly(p, tooltip = "text") %>%
layout(
annotations = anno, dragmode = "pan",
xaxis = list(range = c(view_start, view_end),
title = paste("Chromosome/Scaffold", view_chr)),
yaxis = list(range = c(0, 1 + max_plot_level),
fixedrange = TRUE)
)
return(list(gp = gp, pl = pl))
}
################################# PLOT TRACKS ##################################
# Calculations to normalise samples by condition
.plot_cov_fn_normalize_condition <- function(
view_chr, view_start, view_end, view_strand,
norm_event, condition, tracks = list(), track_names = "", se, avail_files,
conf.int = 0.95, t_test = FALSE, ...
) {
cur_zoom <- floor(log((view_end - view_start) / 50) / log(3))
depth_min <- 10 # depth required for sample to be included in averages
data.list <- list()
data.t_test <- fac <- NULL
max_tracks <- 0
for (i in seq_len(4)) {
if (length(tracks) >= i && is_valid(tracks[[i]])) {
track_samples <- tracks[[i]]
colData <- colData(se)
samples <- rownames(colData)[
unlist(as.character(colData[, condition])
== track_samples)]
event_norms <- assay(se, "Depth")[norm_event, samples]
samples <- samples[event_norms >= depth_min]
event_norms <- event_norms[event_norms >= depth_min]
if (length(avail_files[samples]) > 0 &&
all(file.exists(avail_files[samples]))) {
df <- as.data.frame(.internal_get_coverage_as_df(
samples, avail_files[samples],
view_chr, view_start, view_end, view_strand))
# bin anything with cur_zoom > 4
df <- bin_df(df, max(1, 3^(cur_zoom - 4)))
# message(paste("Group GetCoverage performed for", condition))
for (todo in seq_len(length(samples))) {
df[, samples[todo]] <-
df[, samples[todo]] / event_norms[todo]
}
if (t_test == TRUE) {
if (is.null(data.t_test)) {
data.t_test <- as.matrix(df)
fac <- rep(as.character(i), ncol(df) - 1)
} else {
data.t_test <- cbind(
data.t_test, as.matrix(df[, -1]))
fac <- c(fac, rep(as.character(i), ncol(df) - 1))
}
}
df$mean <- rowMeans(as.matrix(df[, samples]))
df$sd <- rowSds(as.matrix(df[, samples]))
n <- length(samples)
df$ci <- qt((1 + conf.int) / 2, df = n - 1) * df$sd / sqrt(n)
if (length(track_names) == length(tracks)) {
df$track <- track_names[i]
} else {
df$track <- as.character(i)
}
DT <- as.data.table(df)
DT <- DT[, c("x", "mean", "ci", "track")]
data.list[[i]] <- DT
max_tracks <- max_tracks + 1
}
}
}
return(list(
data.list = data.list, data.t_test = data.t_test,
fac = fac, max_tracks = max_tracks
))
}
# Plot all conditions into one track
.plot_cov_fn_plot_by_condition_stacked <- function(calcs, args) {
max_tracks <- calcs$max_tracks
gp_track <- pl_track <- list()
df <- as.data.frame(rbindlist(calcs$data.list))
if (nrow(df) > 0) {
if (length(args$track_names) == length(args$tracks))
df$track <- factor(df$track, args$track_names)
gp_track[[1]] <- ggplot() +
geom_hline(yintercept = 0) +
geom_ribbon(data = df, alpha = 0.2,
aes(x = get("x"), y = get("mean"),
ymin = get("mean") - get("ci"),
ymax = get("mean") + get("ci"),
fill = get("track"))) +
geom_line(data = df, aes(x = get("x"),
y = get("mean"), colour = get("track"))) +
labs(y = "Normalized Coverage") +
theme_white_legend +
theme(legend.title = element_blank())
pl_track[[1]] <- ggplotly(gp_track[[1]],
tooltip = c("x", "y", "ymin", "ymax", "colour")
)
pl_track[[1]] <- pl_track[[1]] %>% layout(
dragmode = "zoom",
yaxis = list(rangemode = "tozero", fixedrange = TRUE)
)
for (j in seq_len(max_tracks)) {
pl_track[[1]]$x$data[[1 + j]]$showlegend <- FALSE
pl_track[[1]]$x$data[[1 + j + max_tracks]]$showlegend <- TRUE
if (length(args$track_names) >= max_tracks) {
pl_track[[1]]$x$data[[1 + j]]$name <- args$track_names[j]
pl_track[[1]]$x$data[[1 + j + max_tracks]]$name <-
args$track_names[j]
} else {
pl_track[[1]]$x$data[[1 + j]]$name <-
paste(args$condition, args$tracks[[j]])
pl_track[[1]]$x$data[[1 + j + max_tracks]]$name <-
paste(args$condition, args$tracks[[j]])
}
}
# remove x axis label, rename y axis
gp_track[[1]] <- gp_track[[1]] + theme(axis.title.x = element_blank()) +
labs(x = "", y = "Normalized Coverage")
}
return(list(
gp_track = gp_track, pl_track = pl_track
))
}
# Plot each condition into separate track
.plot_cov_fn_plot_by_condition_unstacked <- function(calcs, args) {
max_tracks <- calcs$max_tracks
gp_track <- pl_track <- list()
for (i in seq_len(4)) {
if (length(calcs$data.list) >= i && !is.null(calcs$data.list[[i]])) {
df <- as.data.frame(calcs$data.list[[i]])
gp_track[[i]] <- ggplot() +
geom_hline(yintercept = 0) +
geom_ribbon(data = df, alpha = 0.2, colour = NA,
aes(x = get("x"), y = get("mean"),
ymin = get("mean") - get("ci"),
ymax = get("mean") + get("ci"))) +
geom_line(data = df,
aes(x = get("x"), y = get("mean"))) +
labs(y = paste(args$condition, args$tracks[[i]])) +
theme_white_legend
pl_track[[i]] <- ggplotly(gp_track[[i]],
tooltip = c("x", "y", "ymin", "ymax")
)
pl_track[[i]] <- pl_track[[i]] %>% layout(
yaxis = list(rangemode = "tozero", fixedrange = TRUE)
)
pl_track[[i]]$x$data[[2]]$showlegend <- FALSE
pl_track[[i]]$x$data[[3]]$showlegend <- FALSE
if (length(args$track_names) >= i) {
track_name <- args$track_names[i]
} else {
track_name <- paste(args$condition, args$tracks[[i]])
}
pl_track[[i]]$x$data[[2]]$name <- track_name
pl_track[[i]]$x$data[[3]]$name <- track_name
gp_track[[i]] <- gp_track[[i]] +
theme(axis.title.x = element_blank()) +
labs(x = "", y = track_name)
}
}
return(list(
gp_track = gp_track, pl_track = pl_track
))
}
# Plot t-test track
.plot_cov_fn_ttest <- function(
plot_objs, calcs
) {
# Plot t-test track
fac <- NULL
if ("fac" %in% names(calcs)) fac <- calcs$fac
if (!is.null(fac) && length(unique(fac)) == 2) {
fac <- factor(fac)
t_test <- genefilter::rowttests(
as.matrix(calcs$data.t_test[, -1]), fac)
DT <- data.table(x = calcs$data.t_test[, 1])
DT[, c("t_stat") := -log10(t_test$p.value)]
plot_objs$gp_track[[5]] <- ggplot() +
geom_hline(yintercept = 0) +
geom_line(data = as.data.frame(DT),
mapping = aes(x = get("x"), y = get("t_stat"))) +
theme_white_legend
plot_objs$pl_track[[5]] <- ggplotly(plot_objs$gp_track[[5]],
tooltip = c("x", "y")
)
plot_objs$pl_track[[5]] <- plot_objs$pl_track[[5]] %>% layout(
yaxis = list(
rangemode = "tozero",
title = paste("T-test -log10(p)")
)
)
plot_objs$pl_track[[5]]$x$data[[2]]$showlegend <- FALSE
plot_objs$gp_track[[5]] <- plot_objs$gp_track[[5]] +
theme(axis.title.x = element_blank()) +
labs(y = "log10 t-test")
}
return(plot_objs)
}
# Plot individual samples one in each track
.plot_cov_fn_indiv <- function(
view_chr, view_start, view_end, view_strand,
tracks = list(), track_names = NULL, avail_files, ...
) {
cur_zoom <- floor(log((view_end - view_start) / 50) / log(3))
gp_track <- pl_track <- list()
data.list <- list()
for (i in seq_len(4)) {
if (length(tracks) >= i && is_valid(tracks[[i]])) {
track_samples <- tracks[[i]]
filename <- avail_files[which(
names(avail_files) == track_samples)]
if (length(filename) == 1 && file.exists(filename)) {
df <- .internal_get_coverage_as_df("sample", filename,
view_chr, view_start, view_end, view_strand)
df <- bin_df(df, max(1, 3^(cur_zoom - 4)))
data.list[[i]] <- as.data.table(df)
if ("sample" %in% colnames(df)) {
gp_track[[i]] <- ggplot() +
geom_hline(yintercept = 0) +
geom_line(data = df,
aes(x = get("x"), y = get("sample"))) +
theme_white_legend
pl_track[[i]] <- ggplotly(gp_track[[i]],
tooltip = c("x", "y")
)
pl_track[[i]] <- pl_track[[i]] %>% layout(
yaxis = list(
range = c(0, 1 + max(unlist(df[, "sample"]))),
fixedrange = TRUE,
title = paste(track_samples, "")
)
)
pl_track[[i]]$x$data[[2]]$showlegend <- FALSE
if (!missing(track_names) && length(track_names) >= i) {
track_name <- track_names[i]
} else {
track_name <- track_samples
}
pl_track[[i]]$x$data[[2]]$name <- track_name
gp_track[[i]] <- gp_track[[i]] +
theme(axis.title.x = element_blank()) +
labs(x = "", y = track_name)
}
}
}
}
return(list(
gp_track = gp_track, pl_track = pl_track
))
}
# Combine multiple tracks into a plotly plot
plot_cov_fn_finalize <- function(
plot_tracks, view_start, view_end, graph_mode
) {
# Work out which x axis ticks to use, based on zoom level
view_range <- view_end - view_start
min_tick_size <- view_range / 15
tick_order_magn <- 10^floor(log10(min_tick_size))
# round up tick size to nearest 1, 2, 5
if (min_tick_size / tick_order_magn > 5) {
tick_size <- tick_order_magn * 20
} else if (min_tick_size / tick_order_magn > 2) {
tick_size <- tick_order_magn * 10
} else {
tick_size <- tick_order_magn * 5
}
first_tick <- ceiling(view_start / tick_size) * tick_size
final_plot <- subplot(plot_tracks, nrows = length(plot_tracks),
shareX = TRUE, titleY = TRUE) %>%
layout(
xaxis = list(
dtick = tick_size,
tick0 = first_tick,
tickmode = "linear"
)
)
if (graph_mode == "Pan") {
final_plot <- final_plot %>%
layout(dragmode = "pan")
} else if (graph_mode == "Zoom") {
final_plot <- final_plot %>%
layout(dragmode = "zoom")
} else if (graph_mode == "Movable Labels") {
final_plot <- final_plot %>%
layout(dragmode = FALSE) %>%
config(editable = TRUE)
}
# Zero all but last subplot
n_plots <- length(plot_tracks) - 1
final_plot <- final_plot %>% layout(yaxis = list(
rangemode = "tozero", tick0 = 0))
if (n_plots > 1)
final_plot <- final_plot %>% layout(yaxis2 =
list(rangemode = "tozero", tick0 = 0))
if (n_plots > 2)
final_plot <- final_plot %>% layout(yaxis3 =
list(rangemode = "tozero", tick0 = 0))
if (n_plots > 3)
final_plot <- final_plot %>% layout(yaxis4 =
list(rangemode = "tozero", tick0 = 0))
if (n_plots > 4)
final_plot <- final_plot %>% layout(yaxis5 =
list(rangemode = "tozero", tick0 = 0))
return(final_plot)
}
################################# PLOT TRACKS INTERNALS ########################
.internal_get_coverage_as_df <- function(samples, files, seqname, start, end,
strand = c("*", "+", "-")) {
strand <- match.arg(strand)
if (!(strand %in% c("*", "+", "-"))) {
.log(paste("In GetCoverage(),",
"Invalid strand. '*', '+' or '-'"))
}
covData <- list()
for (i in seq_len(length(files))) {
cov <- GetCoverage(files[i], seqname, start - 1, end, strand)
view <- IRanges::Views(cov, start, end)
view.df <- as.data.frame(view[[1]])
covData[[i]] <- view.df
}
df <- do.call(cbind, covData)
colnames(df) <- samples
x <- seq(start, end)
df <- cbind(x, df)
return(df)
}
bin_df <- function(df, binwidth = 3) {
DT <- as.data.table(df)
brks <- seq(1, nrow(DT) + 1, length.out = (nrow(DT) + 1) / binwidth)
bin <- NULL
DT[, c("bin") := findInterval(seq_len(nrow(DT)), brks)]
DT2 <- DT[, lapply(.SD, mean, na.rm = TRUE), by = "bin"]
DT2[, c("bin") := NULL]
return(as.data.frame(DT2))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.