# == title
# Find common genomic regions across samples
#
# == param
# -gr_list a list of `GenomicRanges::GRanges`
# -min_recurrency minimal cross-sample recurrency at each base for the common regions
# -gap gap to merge common regions, pass to `reduce2`
# -max_gap maximum gap for merging common regions, pass to `reduce2`
# -min_width minimal width for the common regions. It can be used to remove a lot of
# very short regions.
#
# == details
# A common region is defined as a region which is recurrent in at least k samples. The process of
# finding common regions are as follows:
#
# - merge regions in all samples into one object
# - calculate coverage which is the recurrency, removed regions with recurrency less than the cutoff
# - merge the segments and remove regions which are too short
#
# Please note in each sample, regions should not be overlapped.
#
# == value
# A `GenomicRanges::GRanges` object contains coordinates of common regions. The columns in meta data
# are percent of the common region which is covered by regions in every sample.
#
# == seealso
# The returned variable can be sent to `subgroup_specific_genomic_regions` to find subgroup specific regions.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# gr_list = list(
# gr1 = GRanges(seqnames = "chr1", ranges = IRanges(1, 8)),
# gr2 = GRanges(seqnames = "chr1", ranges = IRanges(3, 9)),
# gr3 = GRanges(seqnames = "chr1", ranges = IRanges(2, 7)),
# gr4 = GRanges(seqnames = "chr1", ranges = IRanges(c(1, 6), c(4, 10))),
# gr5 = GRanges(seqnames = "chr1", ranges = IRanges(c(1, 9), c(3, 10)))
# )
# common_regions(gr_list, min_recurrency = 4, gap = bp(1))
# common_regions(gr_list, min_recurrency = 4, gap = 0.5)
common_regions = function(gr_list, min_recurrency = floor(length(gr_list)/4),
gap = bp(1000), max_gap = Inf, min_width = 0) {
# merge all gr into one data frame
gr_merge = gr_list[[1]]
for(i in seq_along(gr_list)[-1]) {
gr_merge = c(gr_merge, gr_list[[i]])
message(qq("merging @{i}/@{length(gr_list)} samples"))
}
message("calculating cross-sample coverage")
cov = coverage(gr_merge)
gr_cov = as(cov, "GRanges")
gr_common = reduce2(gr_cov[gr_cov$score >= min_recurrency], gap = gap, max_gap = max_gap)
mcols(gr_common) = NULL
if(length(gr_common) == 0) {
message("No common region has been detected.")
return(NULL)
}
gr_common = gr_common[ width(gr_common) >= min_width ]
message(qq("there are @{length(gr_common)} common regions"))
# calculate the percentage for each CR covered by gr_list
message("overlapping `gr_list` to common regions.")
gr_common = annotate_to_genomic_features(gr_common, gr_list, type = "percent")
return(gr_common)
}
# == title
# Find subgroup specific regions
#
# == param
# -gr a `GenomicRanges::GRanges` object generated by `common_regions`
# -subgroup subgroup information which corresponds to samples in ``gr``
# -present how to define a common region that is specifically present in one subgroup.
# The subgroup specificity is calculated based on the precent matrix stored in ``gr``.
# For each subgroup which is defined in ``subgroup``,
# if ``present`` is a single numeric value, it means the mean value should be larger than it.
# It can also be a function for which the input is the vector of percent in corresponding subgroup
# and the output should be a single logical value.
# -absent how to define a common region is specifically absent in one subgroup. Format is same as ``present``
# -type It uses a string containing 1 and 0 to represent types of specificity. E.g. '1100' means
# present in subgroup 1 and 2 while absent in subgroup 3 and 4. By default, it will output
# all combination of subgroup specificities.
#
# == value
# This function splits the original ``gr`` into a list in which each element
# contains regions corresponding to different subgroup specificity.
#
# == seealso
# The returned value can be sent to `heatmap_subgroup_specificity` to visualize the specificity.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# \dontrun{
# # following two calls are basically the same
# subgroup_specific_genomic_regions(gr, subgroup,
# present = 0.7, absent = 0.3)
# subgroup_specific_genomic_regions(gr, subgroup,
# present = function(x) mean(x) >= 0.7,
# absent = function(x) mean(x) <= 0.3)
# }
# NULL
subgroup_specific_genomic_regions = function(gr, subgroup, present = 0.7, absent = 0.3, type = NULL) {
if(length(subgroup) != ncol(mcols(gr))) {
stop("Length of `subgroup` should be same as number of samples in `gr`\n")
}
level = unique(as.character(subgroup))
if(is.null(type)) {
ss = expand.grid(rep(list(0:1), length(level)))
n_ss = nrow(ss)
ss = ss[-c(1, n_ss), , drop = FALSE]
type = apply(ss, 1, paste, collapse = "")
} else {
ss = as.matrix(as.data.frame(strsplit(type, "")))
ss = t(ss)
dm = dim(ss)
ss = as.integer(ss)
dim(ss) = dm
if(ncol(ss) != length(level)) {
stop("`nchar` of `type` should be same as the number of levels.\n")
}
}
mat = mcols(gr)[, grepl("^overlap_to_", colnames(mcols(gr)))]
mat = as.matrix(mat)
gr_list = vector("list", length(type))
names(gr_list) = type
for(i in seq_along(type)) {
message(qq("extracting subgroup: '@{type[i]}', "), appendLF = FALSE)
l = sapply(seq_len(nrow(mat)), function(k) {
x = mat[k, ]
for(j in seq_along(level)) {
x2 = x[subgroup == level[j]]
if(ss[i, j]) { # present
if(is.function(present)) {
l2 = present(x2)
} else {
l2 = mean(x2) >= present
}
} else { # absent
if(is.function(absent)) {
l2 = absent(x2)
} else {
l2 = mean(x2) <= absent
}
}
if(!l2) {
return(FALSE)
}
}
return(TRUE)
})
message(qq("@{sum(l)} regions"))
gr_list[[i]] = gr[l]
}
attr(gr_list, "subgroup") = subgroup
class(gr_list) = c("subgroup_specific_genomic_regions", "list")
return(gr_list)
}
# == title
# Print subgroup_specific_genomic_regions class object
#
# == param
# -x a `subgroup_specific_genomic_regions` class object
# -... additional arguments
#
# == value
# no value is returned
#
# == author
# ZUguang Gu <z.gu@dkfz.de>
print.subgroup_specific_genomic_regions = function(x, ...) {
subgroup = attr(x, "subgroup")
level = unique(subgroup)
qqcat("Subgroups: @{paste(level, collapse = ', ')} with following specificity patterns:\n")
len = sapply(x, length)
for(nm in names(len)) {
qqcat("- @{nm}: @{len[nm]} regions\n")
}
nm = names(len)
l = grepl("0", nm) & grepl("1", nm) & len > 0
if(sum(l) == 0) {
l = len > 0
}
i = which(l)[1]
qqcat("The digits mean, e.g. @{nm[i]}, ")
digits = strsplit(nm[i], "")[[1]]
for(k in seq_along(digits)) {
if(digits[k] == 0) {
qqcat("absent in @{level[k]}@{ifelse(k == length(digits), '', ', ')}")
} else {
qqcat("present in @{level[k]}@{ifelse(k == length(digits), '', ', ')}")
}
}
qqcat("\n")
}
# == title
# Subset the subgroup_specific_genomic_regions class object
#
# == param
# -x a `subgroup_specific_genomic_regions` object
# -i index
#
# == value
# No value is returned
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
"[.subgroup_specific_genomic_regions" = function(x, i) {
subgroup = attr(x, "subgroup")
class(x) = NULL
y = x[i]
attr(y, "subgroup") = subgroup
class(y) = c("subgroup_specific_genomic_regions", "list")
y
}
# == title
# Heatmap for visualizing subgroup specific genomic regions
#
# == param
# -gr_list a list of `GenomicRanges::GRanges` objects which is generated by `subgroup_specific_genomic_regions`
# -genomic_features Genomic features that are used for annotating regions in ``gr_list``, it should be a list or a single `GenomicRanges::GRanges` objects
# -ha the default column annotation is the subgroups used in `subgroup_specific_genomic_regions` step.
# The value should be a `ComplexHeatmap::HeatmapAnnotation-class` object.
# -... pass to `ComplexHeatmap::draw,HeatmapList-method`
#
# == details
# Columns are clustered within each subgroup and rows are clustered for each type of specificity.
#
# == value
# A `ComplexHeatmap::HeatmapList-class` object
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
heatmap_subgroup_specificity = function(gr_list, genomic_features = NULL,
ha = HeatmapAnnotation(subgroup = attr(gr_list, "subgroup")), ...) {
if(!inherits(gr_list, "subgroup_specific_genomic_regions")) {
stop("`gr_list` should come from `subgroup_specific_genomic_regions()`.")
}
subgroup = attr(gr_list, "subgroup")
# column name
cn = gsub("^overlap_to_", "", grep("^overlap_to_", colnames(mcols(gr_list[[1]])), value = TRUE))
level = unique(subgroup)
mat = NULL
gr_combine = GRanges()
type = NULL
gr_list_name = names(gr_list)
for(i in seq_along(gr_list)) {
if(length(gr_list[[i]]) == 0) next
sub_matrix = mcols(gr_list[[i]])[, grepl("^overlap_to_", colnames(mcols(gr_list[[i]])))]
sub_matrix = as.matrix(sub_matrix)
colnames(sub_matrix) = cn
if(length(gr_list[[i]] == 1)) {
mat = rbind(mat, sub_matrix)
gr_combine = c(gr_combine, gr_list[[i]])
type = c(type, rep(gr_list_name[i], length(gr_list[[i]])))
} else {
# cluster rows for each sub-matrix
message(qq("cluster rows for sub-matrix @{gr_list_name[i]} (@{length(gr_list[[i]])} rows)."))
if(nrow(sub_matrix) == 1) {
rclust = list(order = 1)
} else {
rclust = hclust(dist(sub_matrix))
}
mat = rbind(mat, sub_matrix[rclust$order, , drop = FALSE])
gr_combine = c(gr_combine, gr_list[[i]][rclust$order])
type = c(type, rep(gr_list_name[i], length(gr_list[[i]])))
}
}
mcols(gr_combine) = NULL
# globally cluster inside each subgroup
mat2 = NULL
for(le in level) {
message(qq("cluster columns for samples in subgroup @{le}."))
m = mat[, subgroup == le, drop = FALSE]
if(ncol(m) > 1) {
cclust = hclust(dist(t(m)))
mat2 = cbind(mat2, m[, cclust$order])
} else {
mat2 = cbind(mat2, m)
}
}
type_level = unique(type)
n_type = length(type_level)
if(n_type <= 9) {
type_col = brewer.pal(9, name = "Set1")[1:n_type]
} else if(n_type <= 9+7) {
type_col = c(brewer.pal(9, name = "Set1"), brewer.pal(7, "Set2"))[1:n_type]
} else {
type_col = c(brewer.pal(9, name = "Set1"), brewer.pal(7, "Set2"), rand_color(n_type - 16))
}
names(type_col) = type_level
ht_list = Heatmap(type, name = "type", col = type_col, width = unit(4, "mm"))
ht_list = ht_list + Heatmap(mat2, name = "pct", col = colorRamp2(c(0, 1), c("white", "blue")),
top_annotation = ha,
cluster_rows = FALSE, cluster_columns = FALSE, split = type,
use_raster = nrow(mat2) > 2500, raster_quality = 2)
len = width(gr_combine)
len[len > quantile(len, 0.95)] = quantile(len, 0.95)
ht_list = ht_list + rowAnnotation(length = row_anno_points(len, axis = TRUE, axis_side = "top", size = unit(1, "mm"), gp = gpar(col = "#00000040")), width = unit(2, "cm"),
show_annotation_name = TRUE)
if(!is.null(genomic_features)) {
if(inherits(genomic_features, "GRanges")) {
gf_name = deparse(substitute(genomic_features))
genomic_features = list(genomic_features)
names(genomic_features) = gf_name
}
gr_combine = annotate_to_genomic_features(gr_combine, genomic_features, prefix = "")
ht_list = ht_list + Heatmap(as.matrix(mcols(gr_combine)), name = "anno", col = colorRamp2(c(0, 1), c("white", "orange")),
cluster_columns = FALSE, use_raster = nrow(mat2) > 2500, raster_quality = 2,
width = unit(4, "mm")*length(genomic_features))
}
message(qq("generating heatmap, (nrow = @{nrow(mat2)}, ncol = @{ncol(mat2)})"))
draw(ht_list, ...)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.