#' Make sequence region heatmap relative to scoring
#'
#' Given sequences, DNA or RNA.
#' And some score, ribo-seq fpkm, TE etc.
#' Create a heatmap divided per letter in seqs, by how strong the score is.
#'
#' It will create blocks around the highest rate per position
#' @param seqs the sequences (character vector, DNAStringSet)
#' @param rate a scoring vector (equal size to seqs)
#' @param start position in seqs to start at (first is 1), default 1.
#' @param stop position in seqs to stop at (first is 1),
#' default max(nchar(seqs)), that is the longest sequence length
#' @param center position in seqs to center at (first is 1), center will
#' be +1 in heatmap
#' @param min.observations How many observations per position per letter
#' to accept? numeric or quantile, default (">q1", bigger than quartile 1
#' (25 percentile)). You can do (10), to get all with more than
#' 10 observations.
#' @param skip.startCodon startCodon is defined as after centering
#' (position 1, 2 and 3).
#' Should they be skipped ? default (FALSE).
#' Not relevant if you are not doing Translation initiation sites (TIS).
#' @param xlab Region you are checking, default (TIS)
#' @param type What type is the rate scoring ? default (ribo-seq)
#' @return a ggplot of the heatmap
#' @importFrom data.table melt
#' @export
#' @examples
#'
#' \dontrun{
#' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
#' txdbFile <- system.file("extdata", "hg19_knownGene_sample.sqlite",
#' package = "GenomicFeatures")
#' #Extract sequences of Coding sequences.
#' cds <- loadRegion(txdbFile, "cds")
#' tx <- loadRegion(txdbFile, "mrna")
#'
#' # Get region to check
#' kozakRegions <- startRegionString(cds, tx, BSgenome.Hsapiens.UCSC.hg19::Hsapiens
#' , upstream = 4, 5)
#' # Some toy ribo-seq fpkm scores on cds
#' set.seed(3)
#' fpkm <- sample(1:115, length(cds), replace = TRUE)
#' kozakHeatmap(kozakRegions, fpkm, 1, 9, skip.startCodon = F)
#' }
#' }
#'
kozakHeatmap <- function(seqs, rate, start = 1, stop = max(nchar(seqs)),
center = ceiling((stop - start + 1)/2),
min.observations = ">q1", skip.startCodon = FALSE,
xlab = "TIS", type = "ribo-seq") {
if (length(seqs) != length(rate)) stop("Length of rate and seq must be equal!")
if (length(seqs) == 0 | length(rate) == 0) stop("Length of rate and seq must be > 0!")
if (start > stop) stop("Stop must be bigger or equal to start")
if (is.null(names(seqs))) names(seqs) <- as.character(seq(1, length(seqs)))
# Update to only full length seqs
hits <- nchar(seqs) >= stop
seqs <- seqs[hits]; rate <- rate[hits]
if (!all(hits)) { # inform that not all seqs are full length
message("Not all sequences can be used, not full length")
print(summary(hits))
} else if (all(!hits)) stop("No seqs long enough for interval start, stop!")
dt <- data.table(X.gene_id = names(seqs))
vars <- c()
for (i in seq(start, stop)) {
dt[,paste0("seq", i)] <- substring(seqs, i, i)
vars <- c(vars, paste0("seq", i))
}
dt$rate <- rate
dt.melt <- melt(dt, id.vars = c("X.gene_id", "rate"))
codon.table <- dt.melt[, .(median_score = median(rate, na.rm = TRUE),
count_seq_pos_with_count = .N),
by = .(variable, value)]
uniques <- rev(unique(dt.melt$value))
codon.table$value <- factor(codon.table$value, uniques)
xPos <- seq(start, stop + 1) - start + 1 - center
xPos <- xPos[-center]
xPos[xPos > 0] <- paste0("+", xPos[xPos > 0])
codon.table$variable <- factor(codon.table$variable, levels = vars,
labels = xPos)
if (skip.startCodon) {
codon.table[codon.table$variable %in% c("+1", "+2", "+3"), ]$median_score <- NA
}
#output position where we have more than 1st quartile observations etc
print("Distribution of observations per position per letter")
print(summary(codon.table$count_seq_pos_with_count))
if (min.observations == ">q1") {
quart <- summary(codon.table$count_seq_pos_with_count)[2]
} else if (is.numeric(min.observations)) {
quart <- min.observations
} else stop("min.observations must be >q1 or a numeric!")
print(paste0("Picking: >", quart, " observations"))
codon.table.filtered <- codon.table[count_seq_pos_with_count > quart,]
if (nrow(codon.table.filtered) == 0) stop("No rows passed min.observations!")
plot_matrix2_log <- ggplot(data=codon.table.filtered,
aes(x=variable, y=value, fill=log2(median_score))) +
theme(panel.background=element_rect(fill="lightgrey", colour="lightgrey"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
geom_tile(color = "lightgrey") +
scale_fill_gradientn(colors = c("blue", "white", "red"), na.value = 'lightgrey',
name = paste0("log2(median ", type,")")) +
xlab(paste0("Position relative to ", xlab)) +
ylab("Nucleotide")
rows <- seq(from = length(unique(codon.table.filtered$value)) - 0.5, to = 0.5)
names(rows) <- unique(codon.table.filtered$value)
xmin <- -0.5
# Per position/column (set the black max box)
for(col in unique(codon.table.filtered$variable)) {
mat <- codon.table.filtered[codon.table.filtered$variable == col,]
# Bug here, if not all letters are remaining
highest <- rows[names(rows) == mat$value[which.max(mat$median_score)]]
xmin = xmin + 1;xmax = xmin + 1; ymin = highest;ymax = ymin + 1
if (length(highest) == 0) next
input <- paste0("geom_rect(aes(xmin=",xmin,",xmax=",xmax,",ymin=",
ymin,",ymax=",ymax,"), color='black', size=0.5, fill=NA)")
plot_matrix2_log <- plot_matrix2_log +
eval(parse(text = input))
}
return(plot_matrix2_log)
}
#' Rank kozak initiation sequences
#'
#' Defined as region (-4, -1) relative to TIS
#' @param cds_k cds ranges (GRangesList)
#' @param mrna mrna ranges (GRangesList)
#' @param dt.ir data.table with a column called IR, initiation rate
#' @param group.min numeric, default 10. Minimum transcripts per initation
#' group to be included
#' @inheritParams kozakSequenceScore
#' @return a ggplot grid object
#' @export
kozak_IR_ranking <- function(cds_k, mrna, dt.ir, faFile, group.min = 10,
species = "human") {
faFile <- findFa(faFile)
seqs <- startRegionString(cds_k, tx = mrna, faFile = faFile, upstream = 4, downstream = -1)
dt.ir$seqs <- seqs
kozak_scores <- kozakSequenceScore(cds_k, mrna, faFile, species = species)
dt.ir$upstream_kozak_strength <- kozak_scores
merged_rates <- dt.ir[, .(mean_IR = mean(IR),
upstream_kozak_strength = mean(upstream_kozak_strength),
count = .N)
, by = seqs]
print("Distribution of observations group")
print(summary(merged_rates$count))
message(paste("Originally", nrow(merged_rates), "groups of sequences"))
merged_rates <- merged_rates[ count > group.min,]
message(paste("Filter down to", nrow(merged_rates), "groups of sequences"))
merged_rates$seqs <- factor(merged_rates$seqs, levels = merged_rates[order(mean_IR),]$seqs)
scales_kozak <- c(ceiling(min(merged_rates$upstream_kozak_strength * 10)) / 10)
scales_kozak <- c(scales_kozak, scales_kozak + 0.1)
mean_IR_20 <- ggplot(data=merged_rates, aes(x=seqs, y=1, fill=log2(mean_IR))) +
geom_tile() +
scale_fill_gradientn(colors = c("blue", "white", "red1", "red2", "red3")) +
theme_light() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(fill = "IR") +
xlab("Initiation sequence") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
theme(legend.key.size = unit(0.4, "cm"))
mean_IR_20
tile_KS_20 <- ggplot(data=merged_rates, aes(x=seqs, y=1, fill=upstream_kozak_strength)) +
geom_tile() +
# scale_fill_gradientn(colors = c("#146DA8", "white", "#A87225")) +
scale_fill_gradientn(colors = c("blue", "white", "red"), breaks = scales_kozak) +
theme_light() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(fill = "Kozak") +
xlab("Initiation sequence") +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
theme(legend.key.size = unit(0.4, "cm"))
tile_KS_20
ranking <- gridExtra::grid.arrange(mean_IR_20, tile_KS_20, ncol = 1)
return(ranking)
}
#' TOP Motif ecdf plot
#'
#' Given sequences, DNA or RNA.
#' And some score, scanning efficiency (SE), ribo-seq fpkm, TE etc.
#'
#' Top motif defined as a TSS of C and 4 T's or C's (pyrimidins) downstream
#' of TSS C.
#'
#' The right plot groups:
#' C nucleotide, TOP motif (C, then 4 pyrimidines) and
#' OTHER (all other TSS variants).
#' @param seqs the sequences (character vector, DNAStringSet),
#' of 5' UTRs (leaders). See example below for input.
#' @param rate a scoring vector (equal size to seqs)
#' @param start position in seqs to start at (first is 1), default 1.
#' @param stop position in seqs to stop at (first is 1),
#' default max(nchar(seqs)), that is the longest sequence length
#' @param xlim What interval of rate values you want to show
#' type: numeric or quantile of length 2,
#' 1. default c("q10","q99"). bigger than 10 percentile
#' and less than 99 percentile.
#' 2. Set to numeric values, like c(5, 1000),
#' 3. Set to NULL if you want all values. Backend uses coord_cartesian.
#' @param type What type is the rate scoring ? default ("Scanning efficiency")
#' @param legend.position.1st adjust left plot label position, default c(0.75, 0.28),
#' ("none", "left", "right", "bottom", "top", or two-element numeric vector)
#' @param legend.position.motif adjust right plot label position, default c(0.75, 0.28),
#' ("none", "left", "right", "bottom", "top", or two-element numeric vector)
#' @return a ggplot gtable of the TOP motifs in 2 plots
#' @importFrom cowplot plot_grid
#' @importFrom gridExtra grid.arrange
#' @export
#' @examples
#'
#' \dontrun{
#' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
#' txdbFile <- system.file("extdata", "hg19_knownGene_sample.sqlite",
#' package = "GenomicFeatures")
#' #Extract sequences of Coding sequences.
#' leaders <- loadRegion(txdbFile, "leaders")
#'
#' # Should update by CAGE if not already done
#' cageData <- system.file("extdata", "cage-seq-heart.bed.bgz",
#' package = "ORFik")
#' leadersCage <- reassignTSSbyCage(leaders, cageData)
#' # Get region to check
#' seqs <- startRegionString(leadersCage, NULL,
#' BSgenome.Hsapiens.UCSC.hg19::Hsapiens, 0, 4)
#' # Some toy ribo-seq fpkm scores on cds
#' set.seed(3)
#' fpkm <- sample(1:115, length(leadersCage), replace = TRUE)
#' # Standard arguments
#' TOP.Motif.ecdf(seqs, fpkm, type = "ribo-seq FPKM",
#' legend.position.1st = "bottom",
#' legend.position.motif = "bottom")
#' # with no zoom on x-axis:
#' TOP.Motif.ecdf(seqs, fpkm, xlim = NULL,
#' legend.position.1st = "bottom",
#' legend.position.motif = "bottom")
#' }
#' }
#'
TOP.Motif.ecdf <- function(seqs, rate, start = 1, stop = max(nchar(seqs)),
xlim = c("q10","q99"),
type = "Scanning efficiency",
legend.position.1st = c(0.75, 0.28),
legend.position.motif = c(0.75, 0.28)) {
if (length(seqs) != length(rate)) stop("Length of rate and seq must be equal!")
if (length(seqs) == 0 | length(rate) == 0) stop("Length of rate and seq must be > 0!")
if (start > stop) stop("Stop must be bigger or equal to start")
# Update to only full length seqs
hits <- nchar(seqs) >= stop
seqs <- seqs[hits]; rate <- rate[hits]
if (!all(hits)) { # inform that not all seqs are full length
message("Not all sequences can be used, here is how many valid (TRUE):")
print(summary(hits))
} else if (all(!hits)) stop("No seqs long enough for interval start, stop!")
dt <- topMotif(seqs, start, stop)
message("Distribution of TOP motifs:")
print(table(dt$TOP))
dt$rate <- rate
# Define plot settings:
new_pallet_1 <- c("#E495A5","#ABB065","#39BEB1","#ACA4E2")
new_pallet_2 <- c("#39BEB1", "#ACA4E2", "#E495A5")
tl <- theme(legend.position = legend.position.1st, legend.background=element_blank())
tlb <- theme(legend.position = legend.position.motif, legend.background=element_blank(),
axis.ticks.y=element_blank(), axis.text.y = element_blank())
tit <- labs(color = "1st nucleotide")
titb <- labs(color = "Motif")
se1 <- ggplot(data=dt, aes((rate), colour = seq1)) +
stat_ecdf() +
scale_x_log10() +
scale_color_manual(values=new_pallet_1) +
ylab("") +
xlab("") +
theme_bw() +
tl + tit + theme(plot.margin = unit(c(0.1,0.1,0.1,0.1), "cm"))
se2 <- ggplot(data=dt, aes((rate), colour = TOP)) +
stat_ecdf() +
scale_x_log10() +
scale_color_manual(values=new_pallet_2) +
ylab("") +
xlab("") +
theme_bw() +
tlb + titb + theme(plot.margin = unit(c(0.1, 1, 0.1, 0.1), "cm"))
if (!is.null(xlim)) {
if (length(xlim) != 2) stop("xlim must be length 2 if defined!")
if (is.character(xlim)) {
xlim <- as.numeric(gsub("q", "", xlim)) / 100
if (any(xlim > 1 | xlim < 0)) stop("when xlim is percentiles,",
"range must be 0 to 100")
xlim <- quantile(dt$rate, xlim)
}
if (!is.numeric(xlim)) stop("xlim must be valid character or numeric!")
se1 <- se1 + coord_cartesian(xlim = xlim)
se2 <- se2 + coord_cartesian(xlim = xlim)
}
comb <- plot_grid(se1, se2, nrow = 1, align = "v")
comb <- grid.arrange(comb, bottom = paste0("log10(", type, ")"))
return(comb)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.