## Retrieve Sequences Near TSSs
##
## Retrieve sequences surrounding TSSs for further plotting.
##
## @inheritParams common_params
## @param distance Bases to add on each side of each TSS.
.tss_sequences <- function(
experiment,
samples="all",
genome_assembly=NULL,
threshold=NULL,
use_normalized=FALSE,
distance=10,
dominant=FALSE,
data_conditions=NULL
) {
## Check inputs.
assert_that(is(experiment, "tsr_explorer"))
assert_that(is.character(samples))
assert_that(
is.null(genome_assembly) || is.character(genome_assembly) ||
is(genome_assembly, "BSgenome")
)
assert_that(is.null(threshold) || (is.numeric(threshold) && threshold >= 0))
assert_that(is.count(distance))
assert_that(is.flag(dominant))
assert_that(is.null(data_conditions) || is.list(data_conditions))
## Open genome assembly.
genome_assembly <- .prepare_assembly(genome_assembly, experiment)
## Get selected samples.
select_samples <- experiment %>%
extract_counts("tss", samples, use_normalized) %>%
preliminary_filter(dominant, threshold)
## Apply data conditions.
select_samples <- condition_data(select_samples, data_conditions)
## Prepare table for sequence retrieval.
select_samples <- rbindlist(select_samples, idcol="sample")
select_samples[, tss := start]
select_samples <- as_granges(select_samples)
## Add chromosome lengths to GRanges.
assembly_type <- case_when(
is(genome_assembly, "BSgenome") ~ "bsgenome",
is(genome_assembly, "FaFile") ~ "fafile"
)
chrm_lengths <- switch(
assembly_type,
"fafile"=Rsamtools::seqinfo(genome_assembly),
"bsgenome"=GenomeInfoDb::seqinfo(genome_assembly)
)
chrm_lengths <- chrm_lengths[seqlevels(select_samples)]
seqlengths(select_samples) <- seqlengths(chrm_lengths)
## Expand GRanges and remove out of bound.
select_samples <- stretch(select_samples, distance * 2)
out_of_bounds <- .out_of_bounds_index(select_samples)
if (length(out_of_bounds) > 0) {
select_samples <- select_samples[-out_of_bounds]
}
## Retrieve sequences.
seqs <- switch(
assembly_type,
"bsgenome"=BSgenome::getSeq(genome_assembly, select_samples),
"fafile"=Rsamtools::getSeq(genome_assembly, select_samples)
)
seqs <- seqs %>%
as.data.table %>%
{cbind(as.data.table(select_samples), .)}
setnames(seqs, old="x", new="sequence")
## Order samples if required.
if (!all(samples == "all")) {
seqs[, sample := factor(sample, levels=samples)]
}
return(seqs)
}
#' Generate Sequence Logo
#'
#' @description
#' Create a sequence logo for the sequences around TSSs.
#'
#' @importFrom Biostrings consensusMatrix
#'
#' @inheritParams common_params
#' @param distance Bases to add on each side of each TSS.
#' @param font_size Font size for plots.
#' @param base_colors Colors for each base.
#' @param ... Arguments passed to ggseqlogo.
#'
#' @details
#' This plotting function uses the ggseqlogo library to make sequence logos
#' from the sequences retrieved by the 'tss_sequences' function.
#' Sequence logos illustrate positional biases for certain bases at specific
#' positions in a set of centered sequences. This is particularly important for
#' TSS analysis since literature has shown strong base preferences spanning TSSs
#' and surrounding sequences.
#'
#' 'genome_assembly' must be a valid genome assembly in either fasta or BSgenome format.
#' fasta formatted genome assemblies should have the file extension '.fasta' or '.fa'.
#' BSgenome assemblies are precompiled Bioconductor libraries for common organisms.
#'
#' 'distance' controls the length upstream and downstream of the TSS
#' for which the sequence will be retrieved.
#'
#' The color of each base is set using the 'base_colors' argument.
#' The argument input should be a named vector, with the base as the name
#' and the desired color of the base as the vector element.
#'
#' A set of functions to control data structure for plotting are included.
#' 'threshold' will define the minimum number of reads a TSS or TSR
#' must have to be considered.
#' 'dominant' specifies whether only the dominant TSS or TSR is considered
#' from the 'mark_dominant' function.
#' For TSSs this can be either dominant per TSR or gene, and for TSRs
#' it is just the dominant TSR per gene.
#' 'data_conditions' allows for the advanced filtering, ordering, and grouping
#' of data.
#'
#' @return ggplot2 object with sequence logo.
#'
#' @seealso
#' \code{\link{plot_sequence_colormap}} for a sequence color map plot.
#'
#' @examples
#' data(TSSs_reduced)
#' assembly <- system.file("extdata", "S288C_Assembly.fasta", package="TSRexploreR")
#'
#' exp <- TSSs_reduced %>%
#' tsr_explorer(genome_assembly=assembly) %>%
#' format_counts(data_type="tss")
#'
#' p <- plot_sequence_logo(exp, distance=5)
#'
#' @export
plot_sequence_logo <- function(
experiment,
samples="all",
genome_assembly=NULL,
threshold=NULL,
use_normalized=FALSE,
distance=10,
dominant=FALSE,
data_conditions=NULL,
ncol=1,
font_size=6,
base_colors=c(
A="#109649", C="#255C99",
G="#F7B32C", T="#D62839"
),
...
) {
## Check if ggseqlogo and cowplot are installed.
if (!requireNamespace("ggseqlogo", quietly = TRUE)) {
stop("Package \"ggseqlogo\" needed for this function to work. Please install it.",
call. = FALSE)
}
if (!requireNamespace("cowplot", quietly = TRUE)) {
stop("Package \"cowplot\" needed for this function to work. Please install it.",
call. = FALSE)
}
## Check inputs.
assert_that(is(experiment, "tsr_explorer"))
assert_that(is.character(samples))
assert_that(
is.null(genome_assembly) || is.character(genome_assembly) ||
is(genome_assembly, "BSgenome")
)
assert_that(is.null(threshold) || (is.numeric(threshold) && threshold >= 0))
assert_that(is.count(distance))
assert_that(is.flag(dominant))
assert_that(is.null(data_conditions) || is.list(data_conditions))
assert_that(is.count(ncol))
assert_that(is.numeric(font_size) && font_size > 0)
assert_that(
is.character(base_colors) &&
all(c("A", "T", "G", "C") %in% names(base_colors))
)
## Get sequences.
tss_sequences <- .tss_sequences(
experiment,
samples,
genome_assembly,
threshold,
use_normalized,
distance,
dominant,
data_conditions
)
## Store status of data conditions.
grouping_status <- case_when(
!is.null(data_conditions$quantiling) ~ "row_quantile",
!is.null(data_conditions$grouping) ~ "row_groups",
TRUE ~ "none"
)
## Grab sequences from input.
if (grouping_status == "none") {
sequences <- tss_sequences %>%
split(.$sample) %>%
map(function(x) {x[["sequence"]]})
} else {
setnames(tss_sequences, old=grouping_status, new="grouping")
sequences <- tss_sequences %>%
as.data.table %>%
split(.$grouping) %>%
map(function(x) {
split(x, x$sample) %>%
map(function(y) {y[["sequence"]]})
})
}
## Create viridis color scheme for bases.
viridis_bases <- ggseqlogo::make_col_scheme(
chars=c("A", "C", "G", "T"),
groups=c("A", "C", "G", "T"),
cols=base_colors[match(
c("A", "C", "G", "T"),
names(base_colors)
)]
)
## Make sequence logo.
if (grouping_status == "none") {
p <- ggseqlogo::ggseqlogo(sequences, ncol=ncol, seq_type="dna", ...) +
theme(text=element_text(size=font_size))
} else {
p <- sequences %>%
map(function(x) {
ggseqlogo::ggseqlogo(x, ncol=ncol, seq_type="dna", ...) +
theme(text=element_text(size=font_size))
})
p <- cowplot::plot_grid(plotlist=p, labels=rev(names(sequences)), ncol=1)
}
return(p)
}
#' Plot Sequence Color Map
#'
#' Make a color map for the sequences around TSSs.
#'
#' @importFrom dplyr bind_cols
#' @importFrom stringr str_length
#'
#' @inheritParams common_params
#' @param base_colors Named vector specifying colors for each base.
#' @param distance Bases to add on each side of each TSS.
#' @param font_size Size of text for plots.
#' @param ... Arguments passed to geom_tile.
#'
#' @details
#' This plotting function generates a ggplot2 base color map for the sequences
#' around TSSs. Color maps represent each base surrounding a TSS as a different color.
#' Since the base composition for every TSS region can be seen in one plot, it's a good
#' companion for sequence logos.
#'
#' The color of each base is set using the 'base_colors' argument.
#' The argument input should be a named vector, with the base as the name,
#' and the desired color of the base as the vector element.
#'
#' 'genome_assembly' must be a valid genome assembly in either fasta or BSgenome format.
#' fasta formatted genome assemblies should have the file extension '.fasta' or '.fa'.
#' BSgenome assemblies are precompiled Bioconductor libraries for common organisms.
#'
#' 'distance' controls the length upstream and downstream of the TSS
#' from which the sequence will be retrieved.
#'
#' A set of functions to control data structure for plotting are included.
#' 'threshold' will define the minimum number of reads a TSS or TSR
#' must have to be considered.
#' 'dominant' specifies whether only the dominant TSS or TSR is considered
#' from the 'mark_dominant' function.
#' For TSSs this can be either dominant per TSR or gene, and for TSRs
#' it is just the dominant TSR per gene.
#' 'data_conditions' allows for the advanced filtering, ordering, and grouping
#' of data.
#'
#' The plot can be rasterized using ggrastr using 'rasterize',
#' and the rasterization DPI set using 'raster_dpi'.
#'
#' @return ggplot2 object of sequence colormap.
#'
#' @seealso
#' \code{\link{plot_sequence_logo}} to plot a sequence logo.
#'
#' @examples
#' data(TSSs_reduced)
#' assembly <- system.file("extdata", "S288C_Assembly.fasta", package="TSRexploreR")
#'
#' exp <- TSSs_reduced %>%
#' tsr_explorer(genome_assembly=assembly) %>%
#' format_counts(data_type="tss")
#'
#' p <- plot_sequence_colormap(exp, distance=5)
#'
#' @export
plot_sequence_colormap <- function(
experiment,
samples="all",
genome_assembly=NULL,
threshold=NULL,
use_normalized=FALSE,
distance=10,
dominant=FALSE,
data_conditions=NULL,
ncol=1,
base_colors=c(
"A"="#109649", "C"="#255C99",
"G"="#F7B32C", "T"="#D62839"
),
font_size=6,
rasterize=FALSE,
raster_dpi=150,
...
) {
## Check inputs.
assert_that(is(experiment, "tsr_explorer"))
assert_that(is.character(samples))
assert_that(
is.null(genome_assembly) || is.character(genome_assembly) ||
is(genome_assembly, "BSgenome")
)
assert_that(is.null(threshold) || (is.numeric(threshold) && threshold >= 0))
assert_that(is.count(distance))
assert_that(is.flag(dominant))
assert_that(is.null(data_conditions) || is.list(data_conditions))
assert_that(is.count(ncol))
assert_that(is.numeric(font_size) && font_size > 0)
assert_that(
is.character(base_colors) &&
all(c("A", "T", "G", "C") %in% names(base_colors))
)
assert_that(is.flag(rasterize))
assert_that(is.count(raster_dpi))
## Check if ggrastr is installed if rasterization set.
if (rasterize) {
if (!requireNamespace("ggrastr", quietly = TRUE)) {
stop("Package \"ggrastr\" needed for this function to work. Please install it.",
call. = FALSE)
}
}
## Get sequences.
tss_sequences <- .tss_sequences(
experiment,
samples,
genome_assembly,
threshold,
use_normalized,
distance,
dominant,
data_conditions
)
## Store status of data conditions.
grouping_status <- case_when(
!is.null(data_conditions$quantiling) ~ "row_quantile",
!is.null(data_conditions$grouping) ~ "row_groups",
TRUE ~ "none"
)
## Split sequences into columns
split_seqs <- tss_sequences[, tstrsplit(sequence, split="")]
setnames(
split_seqs,
old=sprintf("V%s", seq(1, (distance * 2) + 1)),
new=as.character(c(seq(-distance, -1), seq(1, distance + 1)))
)
seq_data <- cbind(tss_sequences, split_seqs)
## Get order of TSSs for plotting.
if (!is.null(data_conditions$ordering)) {
seq_data[, FHASH := fct_reorder(FHASH, row_order)]
}
## Format data for plotting.
long_data <- melt(
seq_data,
measure.vars=as.character(c(seq(-distance, -1), seq(1, distance + 1))),
variable.name="position", value.name="base"
)
long_data[,
c("position", "base") := list(
position=as.numeric(position),
base=factor(base, levels=c("A", "C", "G", "T"))
)
]
## Set sample order if required.
if (!all(samples == "all")) {
long_data[, sample := factor(sample, levels=samples)]
}
## Plot sequence colormap
p <- ggplot(long_data, aes(x=.data$position, y=.data$FHASH))
if (rasterize) {
p <- p + ggrastr::rasterize(
geom_tile(aes(fill=.data$base, color=.data$base), ...),
dpi=raster_dpi
)
} else {
p <- p + geom_tile(aes(fill=.data$base, color=.data$base), ...)
}
p <- p +
scale_fill_manual(values=base_colors) +
scale_color_manual(values=base_colors) +
theme_minimal() +
theme(
axis.title.y=element_blank(),
axis.text.y=element_blank(),
legend.title=element_blank(),
axis.title.x=element_text(margin=margin(t=15)),
panel.grid=element_blank(),
text=element_text(size=font_size)
) +
scale_x_continuous(
breaks=c(1, distance, distance + 1, (distance * 2) + 1),
labels=c(-distance, -1, 1, distance + 1)
)
if (grouping_status == "none") {
p <- p + facet_wrap(. ~ sample, scales="free", ncol=ncol)
} else {
setnames(long_data, old=grouping_status, new="grouping")
p <- p + facet_wrap(grouping ~ sample, scales="free", ncol=ncol)
}
return(p)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.