#' TSS Shifting
#'
#' @description
#' Analyze TSS shifts between samples within a consensus TSR set.
#'
#' @inheritParams common_params
#' @param sample_1 First sample to compare.
#' Vector with sample names for TSSs and TSRs,
#' with names 'TSS' and 'TSR'.
#' @param sample_2 Second sample to compare.
#' Vector with sample name for TSSs and TSRs,
#' with names 'TSS' and 'TSR'.
#' @param comparison_name Name assigned to the results in the TSRexploreR object.
#' @param tss_threshold Minimum number of raw counts required at a TSS for it to
#' be considered in the shifting analysis.
#' @param max_distance TSRs less than this distance apart will be merged.
#' @param min_threshold Minimum number of raw counts required in each TSR for both samples.
#' @param n_resamples Number of resamplings for permutation test.
#'
#' @details
#' This function assesses the difference between TSS distributions from two distinct samples
#' in a set of consensus TSRs by calculating a signed version of the earth mover's distance (EMD)
#' that we term earth mover's score (EMS). For this approach, we imagine that the two TSS
#' distributions in questions are piles of dirt, and ask how much dirt from one pile we would
#' need to move, how far, and in which direction, to mimic the distribution of the other sample.
#' The resulting EMS is between -1 and 1, with larger magnitudes indicating larger shifts and
#' the sign indicating direction (negative values indicate upstream shifts and positive values
#' indicate downstream shifts). The positive and negative components of the EMS are also reported.
#' Lastly, the function calculates a p-value for the null hypothesis that there is no difference
#' between the two samples using a permutation test and an FDR-corrected p-value calculated by the
#' Benjamini-Hochberg procedure.
#'
#' This function also calculates unsigned EMD, which indicates how much TSS "mass" has been moved
#' between two distributions without regard to direction. This is useful in the detection of "balanced"
#' shifts, wherein approximately equal mass is moved upstream and downstream. Examples of this are
#' TSR splitting or merging and a change in TSR shape (e.g., peaked to broad). These will generally
#' be marked by a low EMS, balanced positive and negative scores, and a high EMD. As for EMS, raw and
#' FDR-corrected p-values are reported for EMD.
#'
#' 'sample_1' and 'sample_2' should be the names of the two samples to compare. 'sample_1' should
#' be the control and 'sample_2' the treatment sample.
#' The results will be stored back in the TSRexploreR object with the name given by
#' 'comparison_name'.
#' 'tss_threshold' applies a global threshold to remove TSSs below a certain score,
#' and 'min_threshold' is the minimal score that both TSRs must have to be considered.
#' 'max_distance' is the maximum distance between two two TSRs to be considered for shifting.
#'
#' @return TSRexploreR object with shifting scores added.
#'
#' @examples
#' data(TSSs)
#' assembly <- system.file("extdata", "S288C_Assembly.fasta", package = "TSRexploreR")
#' samples <- data.frame(
#' sample_name=c(sprintf("S288C_D_%s", seq_len(3)), sprintf("S288C_WT_%s", seq_len(3))),
#' file_1=rep(NA, 6), file_2=rep(NA, 6),
#' condition=c(rep("Diamide", 3), rep("Untreated", 3))
#' )
#'
#' exp <- TSSs %>%
#' tsr_explorer(sample_sheet=samples, genome_assembly=assembly) %>%
#' format_counts(data_type="tss") %>%
#' tss_clustering(threshold=3) %>%
#' merge_samples(data_type = "tss", merge_group="condition") %>%
#' merge_samples(data_type = "tsr", merge_group="condition")
#'
#' exp <- tss_shift(
#' exp,
#' sample_1=c(TSS="S288C_WT_1", TSR="S288C_WT_1"),
#' sample_2=c(TSS="S288C_D_1", TSR="S288C_D_1"),
#' comparison_name="Untreated_vs_Diamide",
#' max_distance = 100, min_threshold = 10, n_resamples = 1000L
#' )
#'
#' @export
tss_shift <- function(
experiment,
sample_1,
sample_2,
comparison_name,
tss_threshold=NULL,
max_distance=100,
min_threshold=10,
n_resamples=1000L,
fdr_cutoff=0.05
){
## Input checks.
assert_that(is(experiment, "tsr_explorer"))
assert_that(
(is.character(sample_1) && length(sample_1) == 2) &&
has_name(sample_1, c("TSS", "TSR"))
)
assert_that(
(is.character(sample_2) && length(sample_2) == 2) &&
has_name(sample_2, c("TSS", "TSR"))
)
assert_that(
is.null(tss_threshold) ||
(is.numeric(tss_threshold) && tss_threshold >= 0)
)
assert_that(is.count(max_distance))
assert_that(is.count(min_threshold) && min_threshold > 5)
assert_that(is.integer(n_resamples) && n_resamples >= 100L)
assert_that(
is.numeric(fdr_cutoff) &&
(fdr_cutoff <= 1 & fdr_cutoff > 0)
)
assert_that(is.string(comparison_name))
## Retrieve TSSs and TSRs.
TSSs <- extract_counts(experiment, "tss", c(sample_1["TSS"], sample_2["TSS"]))
TSRs <- extract_counts(experiment, "tsr", c(sample_1["TSR"], sample_2["TSR"]))
## Merge TSRs into consensus ranges.
consensus_TSRs <- TSRs %>%
map(as_granges) %>%
bind_ranges %>%
GenomicRanges::reduce(min.gapwidth=max_distance, ignore.strand=FALSE) %>%
as.data.table(key=c("seqnames", "strand", "start", "end"))
consensus_TSRs[, FHASH := str_c(seqnames, start, end, strand, sep=":")]
## Remove consensus ranges width a width of 1.
consensus_TSRs <- consensus_TSRs[start != end]
## Prepare TSSs for overlap with consensus ranges.
TSSs <- rbindlist(TSSs, idcol="sample")
setkey(TSSs, seqnames, strand, start, end)
## Filter out TSSs below threshold if requested.
if (!is.null(tss_threshold)) {
TSSs <- TSSs[score >= tss_threshold]
}
# Associate TSSs with consensus ranges.
overlap <- foverlaps(TSSs, consensus_TSRs)
overlap <- overlap[!is.na(start)]
## Filter out TSRs without TSSs in both samples.
overlap[, count := uniqueN(sample), by=FHASH]
overlap <- overlap[count == 2, .(sample, seqnames, start=i.start, end=i.end, strand, score, FHASH)]
## Get relative distances of each TSS in a TSR.
overlap[, distance := ifelse(strand == "+", start - min(start), max(start) - start), by=FHASH]
## Prepare table for shift score calculation.
overlap <- overlap %>%
as_granges %>%
sort %>%
as.data.table
overlap <- overlap[, .(sample, seqnames, start, end, strand, score, FHASH, distance)]
## Calculate the shift scores.
shifts <- ShiftScores(
overlap,
baseline_level=sample_1["TSS"],
nresamp=n_resamples,
nthresh=min_threshold
)
## p-value correction for multiple comparisons.
setDT(shifts)
shifts <- melt(
shifts, measure.vars=c("shift_score_pval", "emd_pval"),
variable.name="analysis", value.name="pval"
)
shifts[, c("FDR", "analysis") := list(
p.adjust(pval, "fdr"),
stringr::str_replace(analysis, "_pval$", "")
)]
shifts <- tidyr::pivot_wider(
shifts, names_from=analysis, values_from=c(pval, FDR),
names_glue="{analysis}_{.value}"
)
setDT(shifts)
shifts <- shifts[order(shift_score_FDR)]
setcolorder(shifts, c(
"seqnames", "start", "end", "strand", "pos_component",
"neg_component", "shift_score", "shift_score_pval", "shift_score_FDR",
"emd", "emd_pval", "emd_FDR"
))
## Filter out non-significant results.
shifts <- shifts[shift_score_FDR < fdr_cutoff | emd_FDR < fdr_cutoff]
## Annotate results.
shifts[, c("shift_status", "emd_status") := list(
case_when(
shift_score > 0 & shift_score_FDR < fdr_cutoff ~ "downstream",
shift_score < 0 & shift_score_FDR < fdr_cutoff ~ "upstream",
shift_score == 0 & shift_score_FDR < fdr_cutoff ~ "balanced",
TRUE ~ "n.s."
),
ifelse(emd_FDR < fdr_cutoff, "changed", "n.s.")
)]
## Add results to TSRexploreR object.
shifts[, c("start", "end") := list(as.numeric(start), as.numeric(end))]
experiment@shifting$results[[comparison_name]] <- shifts
return(experiment)
}
#' Shift Score
#'
#' Calculate shift scores and associated permutation test p-values.
#'
#' @importFrom Rcpp sourceCpp
#'
#' @param tss_table Table of TSSs prepared for shifting score calculation.
#' @param baseline_level The sample being used as the baseline for calculation.
#' @param calc_pvalue Whether p-values should be returned for comparisons.
#' @param nresamp Number of resamplings for permutation test.
#' @param nthresh Minimum number of raw counts required in each TSR for both samples.
#' @param check_sort Check that the input is sorted properly.
#'
#' @rdname ShiftScores-function
#' @export
ShiftScores <- function(
tss_table,
baseline_level,
calc_pvalue=TRUE,
nresamp=100L,
nthresh=2,
check_sort=TRUE
){
dat <- tss_table %>%
as_tibble %>%
dplyr::select(fhash=FHASH, sample_indicator=sample, distances=distance, scores=score)
# Assumes fhash is consecutive, no regrouping necessary.
# Assumes there are only two samples.
dat <- dplyr::mutate(
dat, sample_indicator=as.integer(sample_indicator==baseline_level)
)
if(check_sort) dat <- dplyr::arrange(dat, fhash, sample_indicator, distances)
## Filter out TSRs where one of the samples has too low a score.
out_frame <- dat %>%
dplyr::group_by(fhash, sample_indicator) %>%
dplyr::summarise(sum_score=sum(scores)) %>%
dplyr::mutate(toosmall= sum_score < nthresh)
if(sum(out_frame$toosmall) > 0){
warning("Some sequences have fewer than nthresh scores for at least one sample.
These are ignored and returned as NA.")
}
dat <- dat %>%
dplyr::left_join(out_frame) %>%
dplyr::group_by(fhash) %>%
dplyr::filter(!any(toosmall)) %>%
dplyr::ungroup()
## Filter out TSRs where only one sample is present.
dat <- dat %>%
dplyr::group_by(fhash) %>%
dplyr::filter(dplyr::n_distinct(sample_indicator) == 2) %>%
dplyr::ungroup()
## Calculate the shift score.
out <- with(
dat, ## returns a 2 by n_distinct matrix
allTheShiftScores(
fhash, distances, scores, sample_indicator,
as.integer(calc_pvalue), as.integer(nresamp), dplyr::n_distinct(fhash)
)
)
## Change output to data.frame and add the coordinates back.
outdf <- out %>%
t %>%
`colnames<-`(c(
"shift_score", "pos_component", "neg_component",
"emd", "shift_score_pval","emd_pval"
)) %>%
as_tibble %>%
dplyr::bind_cols(dplyr::distinct(dat, fhash)) %>%
tidyr::separate(fhash, into=c("seqnames", "start", "end", "strand"), sep=":")
return(outdf)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.