###################################################################################################
# This tests the counting capabilities of various functions for DNase-C data.
Sys.setlocale(category="LC_COLLATE",locale="C")
dir.create("temp-dna")
file1 <- "temp-dna/1.h5"
file2 <- "temp-dna/2.h5"
suppressPackageStartupMessages(library(diffHic))
simDNA <- function(fout, chrs, npairs, rlen) {
r1 <- sample(length(chrs), npairs, replace=TRUE)
r2 <- sample(length(chrs), npairs, replace=TRUE)
p1 <- as.integer(runif(npairs, 1, chrs[r1] + 1))
p2 <- as.integer(runif(npairs, 1, chrs[r2] + 1))
l1 <- ifelse(rbinom(npairs, 1, 0.5)==1L, 1L, -1L)*rlen
l2 <- ifelse(rbinom(npairs, 1, 0.5)==1L, 1L, -1L)*rlen
savePairs(data.frame(anchor1.id=r1, anchor2.id=r2, anchor1.pos=p1, anchor2.pos=p2, anchor1.len=l1, anchor2.len=l2),
fout, param=pairParam( GRanges(seqlengths=chrs) ))
return(invisible(NULL))
}
comp <- function(chrs, npairs1, npairs2, dist, rlen=10, filter=1L, restrict=NULL, cap=NA) {
simDNA(file1, chrs, npairs1, rlen)
simDNA(file2, chrs, npairs2, rlen)
# Output of squares.
param <- pairParam( GRanges(seqlengths=chrs), restrict=restrict, cap=cap)
y <- squareCounts(c(file1, file2), param=param, width=dist, filter=filter)
# Reference. First, getting all bins.
bin.coords <- list()
for (i in names(chrs)) {
nbins <- ceiling(chrs[[i]]/dist)
bin.ends <- pmin(chrs[[i]], seq_len(nbins)*dist)
bin.starts <- c(1, head(bin.ends, -1)+1)
bin.coords[[i]] <- GRanges(i, IRanges(bin.starts, bin.ends))
}
bin.offset <- c(0L, cumsum(lengths(bin.coords)))
names(bin.coords) <- NULL
suppressWarnings(bin.coords <- do.call(c, bin.coords))
seqlengths(bin.coords) <- chrs
bin.coords$nfrags <- 0L
stopifnot(identical(bin.coords, regions(y)))
# Now running through all bin pairs and assembling an InteractionSet.
collected.isets <- list()
collected.margins <- list()
collected.totals <- list()
for (f in 1:2) {
curf <- c(file1, file2)[f]
fmat <- matrix(0L, length(bin.coords), length(bin.coords))
total <- 0L
for (i in seq_along(chrs)) {
for (j in seq_len(i)) {
cur.i <- names(chrs)[i]
cur.j <- names(chrs)[j]
if (!is.null(restrict) && (!cur.i %in% restrict || !cur.j %in% restrict)) { next }
curdat <- loadData(curf, cur.i, cur.j)
total <- total+ nrow(curdat)
p1 <- curdat$anchor1.pos + ifelse(curdat$anchor1.len > 0, 0L, -curdat$anchor1.len-1L)
p1 <- pmin(p1, chrs[i])
p2 <- curdat$anchor2.pos + ifelse(curdat$anchor2.len > 0, 0L, -curdat$anchor2.len-1L)
p2 <- pmin(p2, chrs[j])
b1 <- ceiling(p1/dist) + bin.offset[i]
b2 <- ceiling(p2/dist) + bin.offset[j]
for (x in seq_along(b1)) {
fmat[b1[x], b2[x]] <- fmat[b1[x], b2[x]] + 1L
if (b1[x]!=b2[x]) {
fmat[b2[x], b1[x]] <- fmat[b2[x], b1[x]] + 1L
}
}
}
}
cm <- ContactMatrix(fmat, seq_along(bin.coords), seq_along(bin.coords), regions=bin.coords)
extractor <- upper.tri(fmat, diag=TRUE)
is <- deflate(cm, extract=extractor)
collected.isets[[f]] <- is
collected.margins[[f]] <- as.integer(rowSums(fmat) + diag(fmat)) # diagonal gets counted twice.
collected.totals[[f]] <- total
}
ref <- do.call(cbind, collected.isets)
ref <- ref[rowSums(assay(ref)) >= filter,]
interactions(ref) <- as(interactions(ref), "ReverseStrictGInteractions")
storage.mode(assay(ref)) <- "integer"
colnames(ref) <- NULL
# Checking if interactions and counts are equal.
m <- match(y, ref)
stopifnot(identical(assay(ref)[m,], assay(y)))
stopifnot(all(!is.na(m)))
stopifnot(!anyDuplicated(m))
stopifnot(nrow(y)==nrow(ref))
# Checking the totals.
stopifnot(identical(y$totals, as.integer(unlist(collected.totals))))
if (is.null(restrict)) {
stopifnot(identical(y$totals, as.integer(c(npairs1, npairs2))))
}
totes <- totalCounts(c(file1, file2), param=param)
stopifnot(identical(totes, y$totals))
# Checking for restriction.
if (!is.null(restrict)) {
y.alt <- squareCounts(c(file1, file2), param=param, width=dist, filter=filter, restrict.regions=TRUE)
if (!identical(assay(y), assay(y.alt)) || !identical(anchors(y), anchors(y.alt)) ||
any(! seqlevelsInUse(regions(y.alt)) %in% restrict)) {
stop("restrict.regions=TRUE in squareCounts doesn't work for DNase-C data")
}
}
# Checking if the marginal counts... add up.
mrg <- marginCounts(c(file1, file2), param=param, width=dist)
out <- assay(mrg)
dimnames(out) <- NULL
stopifnot(identical(out, do.call(cbind, collected.margins)))
stopifnot(identical(bin.coords, rowRanges(mrg)))
# Checking that the neighborhood gives the same output.
nbr <- neighborCounts(c(file1, file2), param=param, width=dist, filter=filter, flank=5)
stopifnot(identical(assay(nbr), assay(y)))
stopifnot(all(interactions(nbr)==interactions(y)))
return(head(assay(y)))
}
set.seed(234711)
chrs <- c(chrA=1000, chrB=2000)
comp(chrs, 100, 200, 100, rlen=10, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 200, 200, 100, rlen=10, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 100, 200, 75, rlen=10, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 100, 200, 220, rlen=10, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 75, rlen=10, filter=5L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 100, rlen=10, filter=5L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 150, rlen=10, filter=5L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 200, rlen=10, filter=5L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 75, rlen=50, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 100, rlen=50, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 150, rlen=50, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 200, rlen=50, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 75, rlen=10, filter=1L, restrict="chrA", cap=NA)
comp(chrs, 500, 500, 100, rlen=10, filter=1L, restrict="chrB", cap=NA)
comp(chrs, 500, 500, 150, rlen=10, filter=1L, restrict="chrA", cap=NA)
comp(chrs, 500, 500, 200, rlen=10, filter=1L, restrict="chrB", cap=NA)
comp(chrs, 500, 500, 75, rlen=10, filter=1L, restrict=NULL, cap=5) # Should have no effect.
comp(chrs, 500, 500, 100, rlen=10, filter=1L, restrict=NULL, cap=5)
comp(chrs, 500, 500, 150, rlen=10, filter=1L, restrict=NULL, cap=5)
comp(chrs, 500, 500, 200, rlen=10, filter=1L, restrict=NULL, cap=5)
chrs <- c(chrA=1000, chrB=1000, chrC=1000) # Trying for more chromosomes.
comp(chrs, 500, 500, 75, rlen=10, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 100, rlen=10, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 150, rlen=10, filter=1L, restrict=NULL, cap=NA)
comp(chrs, 500, 500, 200, rlen=10, filter=1L, restrict=NULL, cap=NA)
##################################################################################################
# Repeating the tests for connectCounts
simranges <- function(chrs, nranges, size.range=c(20, 100))
# Generates simulated ranges.
{
ranges <- list()
for (chr in names(chrs)) {
chr.len <- chrs[[chr]]
range.start <- round(runif(nranges, 1, chr.len))
range.end <- pmin(chr.len, round(range.start + runif(nranges, size.range[1], size.range[2])))
ranges[[chr]] <- GRanges(chr, IRanges(range.start, range.end))
}
names(ranges) <- NULL
suppressWarnings(ranges <- do.call(c, ranges))
seqlengths(ranges) <- chrs
return(ranges)
}
concomp <- function(chrs, npairs1, npairs2, regions, rlen=10, filter=1L, restrict=NULL, cap=NA, seconds=NULL, type="any") {
simDNA(file1, chrs, npairs1, rlen)
simDNA(file2, chrs, npairs2, rlen)
# Output of running connectCounts
param <- pairParam( GRanges(seqlengths=chrs), restrict=restrict, cap=cap)
y <- connectCounts(c(file1, file2), param=param, regions, filter=1L, second.regions=seconds, type=type)
y.matches <- do.call(paste, anchors(y, id=TRUE))
y.counts <- assay(y)
if (filter>1L) {
y.filt <- connectCounts(c(file1, file2), param=param, regions=regions, filter=filter, second.regions=seconds, type=type)
stopifnot(isTRUE(all.equal(y.filt, y[rowSums(assay(y)) >= filter,])))
}
if (!is.null(restrict)) {
y.alt <- connectCounts(c(file1, file2), param=param, regions=regions, filter=1L, second.regions=seconds, type=type, restrict.regions=TRUE)
if (!identical(assay(y), assay(y.alt)) || !identical(anchors(y), anchors(y.alt)) ||
any(! seqlevelsInUse(regions(y.alt)) %in% restrict)) {
stop("restrict.regions=TRUE in connectCounts doesn't work for DNase-C data")
}
}
# Reference block - first, making the regions.
regions$nfrags <- 0L
regions$original <- seq_along(regions)
if (!is.null(seconds)) {
if (is.numeric(seconds)) {
extras <- diffHic:::.createBins(param, seconds)$region
extras$original <- NA_integer_
} else {
extras <- seconds
extras$nfrags <- 0L
extras$original <- seq_along(extras)
}
regions$is.second <- FALSE
extras$is.second <- TRUE
suppressWarnings(regions <- c(regions, extras))
}
regions <- sort(regions)
stopifnot(all(regions==regions(y)))
stopifnot(identical(regions$is.second, regions(y)$is.second))
stopifnot(identical(regions$original, regions(y)$original))
# Setting up the combining function.
secondary <- regions$is.second
combFUN <- function(hits1, hits2) {
ex1 <- rep(hits1, each=length(hits2))
ex2 <- rep(hits2, length(hits1))
if (!is.null(secondary)) {
keep <- secondary[ex1]!=secondary[ex2]
ex1 <- ex1[keep]
ex2 <- ex2[keep]
}
out1 <- pmax(ex1, ex2)
out2 <- pmin(ex1, ex2)
unique(paste(out1, out2))
}
# Running through them and making sure they match up.
for (f in 1:2) {
curf <- c(file1, file2)[f]
collected <- list()
for (i in seq_along(chrs)) {
for (j in seq_len(i)) {
cur.i <- names(chrs)[i]
cur.j <- names(chrs)[j]
if (!is.null(restrict) && (!cur.i %in% restrict || !cur.j %in% restrict)) { next }
curdat <- loadData(curf, cur.i, cur.j)
r1 <- GRanges(cur.i, IRanges(curdat$anchor1.pos, width=abs(curdat$anchor1.len)))
r2 <- GRanges(cur.j, IRanges(curdat$anchor2.pos, width=abs(curdat$anchor2.len)))
olap1 <- findOverlaps(r1, regions, type=type)
olap2 <- findOverlaps(r2, regions, type=type)
for (x in seq_len(nrow(curdat))) {
colap1 <- subjectHits(olap1)[queryHits(olap1)==x]
colap2 <- subjectHits(olap2)[queryHits(olap2)==x]
if (!length(colap1) || !length(colap2)) next
collected[[length(collected)+1L]] <- combFUN(colap1, colap2)
}
}
}
collected <- unlist(collected)
col.counts <- table(collected)
m <- match(names(col.counts), y.matches)
stopifnot(all(!is.na(m)))
y.counts[m,f] <- y.counts[m,f] - col.counts
}
stopifnot(all(y.counts==0L))
return(head(assay(y)))
}
set.seed(1012)
chrs <- c(chrA=1000, chrB=2000)
regs <- simranges(chrs, 10)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=NA)
concomp(chrs, 100, 100, regs, rlen=50L, filter=1L, restrict=NULL, cap=NA)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict="chrB", cap=NA)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=1L)
concomp(chrs, 100, 100, regs, rlen=10L, filter=2L, restrict=NULL, cap=NA)
# Checking other types of overlaps
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=NA, type="within")
concomp(chrs, 100, 100, regs, rlen=50L, filter=1L, restrict=NULL, cap=NA, type="within")
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict="chrB", cap=NA, type="within")
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=1L, type="within")
concomp(chrs, 100, 100, regs, rlen=10L, filter=2L, restrict=NULL, cap=NA, type="within")
# Checking with bigger ranges.
regs <- simranges(chrs, 10, size.range=c(100, 500))
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=NA)
concomp(chrs, 100, 100, regs, rlen=50L, filter=1L, restrict=NULL, cap=NA)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict="chrB", cap=NA)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=1L)
concomp(chrs, 100, 100, regs, rlen=10L, filter=2L, restrict=NULL, cap=NA)
# Checking with more ranges.
chrs <- c(chrA=1000, chrB=1000, chrC=1000)
regs <- simranges(chrs, 20)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=NA)
concomp(chrs, 100, 100, regs, rlen=50L, filter=1L, restrict=NULL, cap=NA)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict="chrB", cap=NA)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=1L)
concomp(chrs, 100, 100, regs, rlen=10L, filter=2L, restrict=NULL, cap=NA)
# Checking with secondary regions.
chrs <- c(chrA=1000, chrB=2000)
regs <- simranges(chrs, 10)
regs2 <- simranges(chrs, 10)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=NA, seconds=regs2)
concomp(chrs, 100, 100, regs, rlen=50L, filter=1L, restrict=NULL, cap=NA, seconds=regs2)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict="chrB", cap=NA, seconds=regs2)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=1L, seconds=regs2)
concomp(chrs, 100, 100, regs, rlen=10L, filter=2L, restrict=NULL, cap=NA, seconds=regs2)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=NA, seconds=100)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=NA, seconds=200)
concomp(chrs, 100, 100, regs, rlen=10L, filter=1L, restrict=NULL, cap=NA, seconds=500)
##################################################################################################
# Miscellaneous bits and pieces, to check they work properly with DNase-C data.
# boxPairs.
set.seed(34234)
chrs <- c(chrA=1000, chrB=2000)
simDNA(file1, chrs, 1000, 10)
simDNA(file2, chrs, 1000, 10)
param <- pairParam( GRanges(seqlengths=chrs))
y1 <- squareCounts(c(file1, file2), param=param, width=100, filter=1L)
y2 <- squareCounts(c(file1, file2), param=param, width=200, filter=1L)
checkFUN <- function(y, box) {
olap <- findOverlaps(y, box, type="within")
stopifnot(identical(queryHits(olap), seq_len(nrow(y))))
return(subjectHits(olap))
}
out <- boxPairs(y1, y2)
out$interactions
stopifnot(identical(out$indices[[1]], checkFUN(y1, out$interactions)))
stopifnot(identical(out$indices[[2]], checkFUN(y2, out$interactions)))
out <- boxPairs(y1, y2, reference=400)
out$interactions
stopifnot(identical(out$indices[[1]], checkFUN(y1, out$interactions)))
stopifnot(identical(out$indices[[2]], checkFUN(y2, out$interactions)))
# Patch extraction.
yref <- squareCounts(file1, param=param, width=100, filter=1L)
dummy.1 <- resize(regions(yref)[1], width=200)
dummy.2 <- resize(regions(yref)[length(regions(yref))], fix="end", width=200)
for (cur.dummy in list(dummy.1, dummy.2)) {
patch <- extractPatch(file1, param, cur.dummy, width=100)
ref <- yref[overlapsAny(yref, cur.dummy, use.region="first") & overlapsAny(yref, cur.dummy, use.region="second"),1]
stopifnot(identical(anchors(patch, id=TRUE), anchors(ref, id=TRUE)))
stopifnot(identical(regions(patch), regions(ref)))
stopifnot(identical(assay(patch), assay(ref)))
print(interactions(patch))
patch.alt <- extractPatch(file1, param, cur.dummy, width=100, restrict.regions=TRUE) # Checking restrict.regions= works.
stopifnot(identical(assay(patch), assay(patch.alt)))
stopifnot(identical(anchors(patch), anchors(patch.alt)))
stopifnot(all(seqnames(regions(patch.alt)) %in% seqnames(cur.dummy)))
}
patch <- extractPatch(file1, param, dummy.1, dummy.2, width=100) # Different regions this time.
ref <- yref[overlapsAny(yref, dummy.2, use.region="first") & overlapsAny(yref, dummy.1, use.region="second"),1]
stopifnot(identical(anchors(patch, id=TRUE), anchors(ref, id=TRUE)))
stopifnot(identical(regions(patch), regions(ref)))
stopifnot(identical(assay(patch), assay(ref)))
interactions(patch)
# getArea
head(getArea(y1))
head(getArea(y1, bp=FALSE))
# Plotting.
pdf("temp-dna/out.pdf")
plotPlaid(file1, param, first.region=GRanges("chrA", IRanges(1, 100)),
second.region=GRanges("chrA", IRanges(1, 200)), width=50, diag=TRUE)
plotPlaid(file1, param, first.region=GRanges("chrA", IRanges(1, 100)),
second.region=GRanges("chrB", IRanges(1, 200)), width=50, diag=TRUE)
rotPlaid(file1, param, region=GRanges("chrA", IRanges(1, 200)), width=50)
rotPlaid(file1, param, region=GRanges("chrB", IRanges(1, 200)), width=50)
dev.off()
##################################################################################################
# Cleaning up.
unlink("temp-dna", recursive=TRUE)
##################################################################################################
# End.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.