#Copyright: Bori Gerle (gerle@ebi.ac.uk), EMBL-EBI, 2011
#declare global variable names to silence R CMD check notes
globalVariables(c("BSgenome.Hsapiens.UCSC.hg19", "V1", "chr1", "chr2", "filtered", "frequencies", "int1", "int2", "interactingLoci", "locus1", "locus2", "pvalue", "resGR"))
.onlyPairing <- function(fileName1, fileName2, sampleName, fileType)
{
#read in both ends of the paired-end reads, match them by read identifier and return the paired reads as GRanges objects plus a data.frame
if(fileType=="Table"){
reads1=read.table(fileName1, header=F, sep='\t', colClasses=c(rep('character', times=3), 'numeric', rep("NULL", times=4)))
reads1_1=GRanges(seqnames=reads1$V3, ranges=IRanges(start=reads1$V4, end=reads1$V4), strand=reads1$V2, id=sapply(strsplit(reads1$V1, ' '), '[[', 2))
reads2=read.table(fileName2, header=F, sep='\t', colClasses=c(rep('character', times=3), 'numeric', rep("NULL", times=4)))
reads2_1=GRanges(seqnames=reads2$V3, ranges=IRanges(start=reads2$V4, end=reads2$V4), strand=reads2$V2, id=sapply(strsplit(reads2$V1, ' '), '[[', 2))
id1=as.character(elementMetadata(reads1_1)$id)
id2=as.character(elementMetadata(reads2_1)$id)
ids1=id1
ids2=id2
}
if(fileType%in%c("Bowtie", "BAM")){
if(fileType=="Bowtie"){
reads1=readAligned(fileName1,type=fileType) ## returns identifier, strand, chromosome, read position and quality
reads1_1=as(reads1,"GRanges") ## Grange object containing 4 slots, seqnames, ranges, strand, elementMetadata (data.frame)
reads2=readAligned(fileName2,type=fileType)
reads2_1=as(reads2,"GRanges")
#match IDs to find paired reads where both ends were mapped
id1=as.character(elementMetadata(reads1_1)$id)
id2=as.character(elementMetadata(reads2_1)$id)
}
if(fileType=='BAM'){
what=c("qname","rname","pos","qwidth", "strand")
param=ScanBamParam(what=what)
reads1=scanBam(fileName1, param=param)
reads2=scanBam(fileName2, param=param)
id1=reads1[[1]]$qname
id2=reads2[[1]]$qname
reads1_1=GRanges(seqnames=reads1[[1]]$rname, ranges=IRanges(start=reads1[[1]]$pos, end=reads1[[1]]$pos+reads1[[1]]$qwidth), strand=reads1[[1]]$strand)
reads2_1=GRanges(seqnames=reads2[[1]]$rname, ranges=IRanges(start=reads2[[1]]$pos, end=reads2[[1]]$pos+reads2[[1]]$qwidth), strand=reads2[[1]]$strand)
if(length(grep('SRR',id1[1]))==1|length(grep('ERR',id1[1]))==1){
ids1=id1
ids2=id2
elementMetadata(reads1_1)$id = id1
elementMetadata(reads2_1)$id = id2
}else
{
ids1 = sapply(strsplit(id1, '\\-'), '[[', 2)
ids2 = sapply(strsplit(id2, '\\-'), '[[', 2)
elementMetadata(reads1_1)$id = ids1
elementMetadata(reads2_1)$id = ids2}
}else
{
#old fastq files use /1 and /2 to differentiate read pairs, CASAVA 1.8 separates members of a pair with spaces
firsttwo=sapply(strsplit(id1[1:2], '[/ ]'), '[[', 1)
if(firsttwo[1]==firsttwo[2]){
ids1 = sapply(strsplit(id1, ' '), '[[', 1)
ids2 = sapply(strsplit(id2, ' '), '[[', 1)
}else
{
ids1 = sapply(strsplit(id1, '[/ ]'), '[[', 1)
ids2 = sapply(strsplit(id2, '[/ ]'), '[[', 1)
}
}
}
rm(id1,id2)
s1=which(ids1%in%ids2)
s2=which(ids2%in%ids1)
paired_reads_1=reads1_1[s1]
rm(reads1_1)
paired_reads_1=paired_reads_1[order(elementMetadata(paired_reads_1)$id)]
paired_reads_2=reads2_1[s2]
rm(reads2_1)
paired_reads_2=paired_reads_2[order(elementMetadata(paired_reads_2)$id)]
paired_df=cbind(as.character(seqnames(paired_reads_1)),as.character(start(ranges(paired_reads_1))),as.character(end(ranges(paired_reads_1))),as.character(seqnames(paired_reads_2)),as.character(start(ranges(paired_reads_2))),as.character(end(ranges(paired_reads_2))))
colnames(paired_df)=c("chr1","start1","end1", "chr2","start2","end2")
#save paired reads in GRanges objects and data.frame
paired_df=data.frame(paired_df,frequencies=rep(1,times=nrow(paired_df)),stringsAsFactors=FALSE)
return(list(paired_reads_1=paired_reads_1, paired_reads_2=paired_reads_2, paired_df=paired_df,s1=s1,s2=s2))
}
.countDuplicatesOfRaw <- function(paired_df, sampleName)
{
#count how many times we have the exact same read pairs to be able to remove PCR duplicates
colus=c("chr1","start1","end1", "chr2","start2","end2")
vec <- do.call("paste",c(paired_df[colus],sep="_"))
names(vec) <- c(1:length(vec))
dup <- duplicated(vec)
uniq <-vec[!dup]
uniqfreq <- rep(1, times=length(uniq))
uniq=cbind(uniq,uniqfreq)
colnames(uniq)=c("interaction","freqs")
vec1 <- vec[dup]
vec1 <- sort(vec1)
freq <- cbind(vec1,as.numeric(sequence(rle(vec1)$lengths))+1)
colnames(freq)=c("interaction","freqs")
vec=rbind(uniq, freq)
vec <- vec[order(as.numeric(rownames(vec))),]
paired_df$frequencies <- as.numeric(vec[,2])
freqs <- as.numeric(vec[,2])
freqstable <- table(freqs)
l=c()
for(i in 1:(length(freqstable)-1)){
l[i]=freqstable[i]==freqstable[i+1]
}
p=freqstable[l]
p=p[1]
ths=as.numeric(names(p))
return(list(paired_df=paired_df,freqs=freqs,ths=ths))
}
.filterOutDuplicates <- function(paired_df, paired_reads_1, paired_reads_2, sampleName, DUPLICATETHRESHOLD)
{
#DUPLICATETHRESHOLD: maximum amount of duplicated paired-end reads allowed (over that value it is expected to be PCR bias)
#remove read pairs whose copy number exceeds DUPLICATETHRESHOLD - at most DUPLICATETHRESHOLD copies of each pair are kept
dth <- which(paired_df$frequencies<(DUPLICATETHRESHOLD+1))
paired_reads_1<-paired_reads_1[dth]
paired_reads_2<-paired_reads_2[dth]
return(list(paired_reads_1=paired_reads_1,paired_reads_2=paired_reads_2))
}
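#Illustrative sketch (not part of the pipeline): .countDuplicatesOfRaw numbers the copies of an
#identical read pair 1, 2, 3, ..., so keeping rows with frequencies < DUPLICATETHRESHOLD+1 retains
#at most DUPLICATETHRESHOLD copies of each pair. Values below are made up.
if(FALSE){
#for three identical read pairs the per-row frequencies are 1, 2, 3;
#with DUPLICATETHRESHOLD=1 only the first copy survives
which(c(1, 2, 3) < (1 + 1)) #returns 1
}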
pairReads <- function(fileName1, fileName2, sampleName, DUPLICATETHRESHOLD=1, fileType='BAM')
{
oP=.onlyPairing(fileName1, fileName2, sampleName, fileType)
paired_reads_1=oP$paired_reads_1
paired_reads_2=oP$paired_reads_2
paired_df=oP$paired_df
s1=oP$s1
s2=oP$s2
cDoR=.countDuplicatesOfRaw(paired_df, sampleName)
paired_df=cDoR$paired_df
freqs=cDoR$freqs
ths=cDoR$ths
filtered=.filterOutDuplicates(paired_df, paired_reads_1, paired_reads_2, sampleName, DUPLICATETHRESHOLD)
save(filtered,file=paste(sampleName,"paired_filtered",sep="_"))
return(filtered)
}
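#Hypothetical usage sketch of pairReads (file and sample names are placeholders, not shipped data);
#wrapped in if(FALSE) so that sourcing this file has no side effects.
if(FALSE){
paired <- pairReads("sample_R1.bam", "sample_R2.bam", sampleName="sample",
DUPLICATETHRESHOLD=1, fileType="BAM")
length(paired$paired_reads_1) #number of read pairs kept after duplicate filtering
}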
.getRestrictionSitesFromBSgenome <- function(BSgenomeName, genome, restrictionSite, enzyme)
{
#check if the genome needed is already installed and install from BioConductor if not
iG=installed.genomes()
if(!BSgenomeName%in%iG){
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install(BSgenomeName)
}
library(BSgenomeName,character.only=TRUE)
chrEnds <- seqlengths(genome)
#find out how many base pairs from the 5' end of the restriction site the enzyme cuts
restRemain <- regexpr("\\^",restrictionSite)[1]-1
#restriction site as regular expression
rSite <- sub('\\^', '', restrictionSite)
#find locations where restriction enzyme cuts for all chromosomes
params <- new("BSParams",X=genome, FUN=matchPattern,simplify=TRUE)
resSite <- bsapply(params, pattern = rSite)
starts <- lapply(resSite, function(x){c(1,(start(x)+restRemain))})
lengths <- lapply(resSite, function(x){length(x)+1})
chrs <- lapply(1:length(names(resSite)),function(i){rep(names(resSite)[i],times=lengths[[i]])})
ends <- lapply(1:length(names(resSite)), function(i){c(start(resSite[[i]])+restRemain-1,chrEnds[i])})
starts <- unlist(starts)
chrs <- unlist(chrs)
ends <- unlist(ends)
#save restriction sites as GRanges
resGR <- GRanges(seqnames=chrs, ranges=IRanges(start=starts, end=ends))
save(resGR,file=paste("Digest",BSgenomeName,enzyme,sep="_"))
return(resGR)
}
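#Example of the restriction-site notation used above: for HindIII the site is written "A^AGCTT",
#where '^' marks the cut position, so restRemain is 1 and each fragment starts 1 bp after a match
#of "AAGCTT". A hypothetical call (requires the BSgenome package to be installed):
if(FALSE){
library(BSgenome.Hsapiens.UCSC.hg19)
hindIIIFragments <- .getRestrictionSitesFromBSgenome("BSgenome.Hsapiens.UCSC.hg19",
BSgenome.Hsapiens.UCSC.hg19, restrictionSite="A^AGCTT", enzyme="HindIII")
}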
#functions for parallel version of findOverlaps
.putRangesOnFirstCircle <- function(x, circle.length)
{
x_start0 <- start(x) - 1L # 0-based start
x_shift0 <- x_start0 %% circle.length - x_start0
shift(x, x_shift0)
}
.findOverlaps.circle <- function(circle.length, query, subject,
maxgap, minoverlap, type)
{
if (is.na(circle.length))
return(findOverlaps(query, subject,
maxgap=maxgap, minoverlap=minoverlap,
type=type, select="all"))
if (type != "any")
stop("overlap type \"", type, "\" is not yet supported ",
"for circular sequence ", names(circle.length))
subject0 <- .putRangesOnFirstCircle(subject, circle.length)
inttree0 <- NCList(subject0)
query0 <- .putRangesOnFirstCircle(query, circle.length)
hits00 <- findOverlaps(query0, inttree0,
maxgap=maxgap, minoverlap=minoverlap,
type=type, select="all")
query1 <- shift(query0, circle.length)
hits10 <- findOverlaps(query1, inttree0,
maxgap=maxgap, minoverlap=minoverlap,
type=type, select="all")
subject1 <- shift(subject0, circle.length)
hits01 <- findOverlaps(query0, subject1,
maxgap=maxgap, minoverlap=minoverlap,
type=type, select="all")
## Merge 'hits00', 'hits10' and 'hits01'.
union(union(hits00, hits10), hits01)
}
.strandAsSignedNumber <- function(x)
{
tmp <- as.integer(runValue(x))
idx <- tmp >= 2L
tmp[idx] <- tmp[idx] - 3L
runValue(x) <- tmp
as.vector(x)
}
.findOverlaps.parallel <- function(query, subject, maxgap=0L, minoverlap=1L,
type=c("any", "start", "end", "within", "equal"),
select=c("all", "first", "last", "arbitrary"),
ignore.strand=TRUE, mc.cores=1, mc.preschedule = TRUE)
{
if (!isSingleNumber(maxgap) || maxgap < 0L)
stop("'maxgap' must be a non-negative integer")
type <- match.arg(type)
select <- match.arg(select)
## merge() also checks that 'query' and 'subject' are based on the
## same reference genome.
seqinfo <- merge(seqinfo(query), seqinfo(subject))
q_len <- length(query)
s_len <- length(subject)
q_seqnames <- seqnames(query)
s_seqnames <- seqnames(subject)
q_seqlevels <- levels(q_seqnames)
s_seqlevels <- levels(s_seqnames)
q_splitranges <- splitRanges(q_seqnames)
s_splitranges <- splitRanges(s_seqnames)
q_ranges <- unname(ranges(query))
s_ranges <- unname(ranges(subject))
if (ignore.strand) {
q_strand <- rep.int(1L, q_len)
s_strand <- rep.int(1L, s_len)
} else {
q_strand <- .strandAsSignedNumber(strand(query))
s_strand <- .strandAsSignedNumber(strand(subject))
}
common_seqlevels <- intersect(q_seqlevels, s_seqlevels)
results <- parallel::mclapply(common_seqlevels,
function(seqlevel)
{
if (isCircular(seqinfo)[seqlevel] %in% TRUE) {
circle.length <- seqlengths(seqinfo)[seqlevel]
} else {
circle.length <- NA
}
q_idx <- q_splitranges[[seqlevel]]
s_idx <- s_splitranges[[seqlevel]]
hits <- .findOverlaps.circle(circle.length,
q_ranges[q_idx],
s_ranges[s_idx],
maxgap, minoverlap, type)
q_hits <- queryHits(hits)
s_hits <- subjectHits(hits)
compatible_strand <- q_strand[q_idx][q_hits] *
s_strand[s_idx][s_hits] != -1L
hits <- hits[compatible_strand]
remapHits(hits, Lnodes.remapping=as.integer(q_idx),
new.nLnode=q_len,
Rnodes.remapping=as.integer(s_idx),
new.nRnode=s_len)
}, mc.cores=mc.cores, mc.preschedule=mc.preschedule)
## Combine the results.
q_hits <- unlist(lapply(results, queryHits))
if (is.null(q_hits))
q_hits <- integer(0)
s_hits <- unlist(lapply(results, subjectHits))
if (is.null(s_hits))
s_hits <- integer(0)
if (select == "arbitrary") {
ans <- rep.int(NA_integer_, q_len)
ans[q_hits] <- s_hits
return(ans)
}
if (select == "first") {
ans <- rep.int(NA_integer_, q_len)
oo <- S4Vectors::orderIntegerPairs(q_hits, s_hits, decreasing=TRUE)
ans[q_hits[oo]] <- s_hits[oo]
return(ans)
}
oo <- S4Vectors::orderIntegerPairs(q_hits, s_hits)
q_hits <- q_hits[oo]
s_hits <- s_hits[oo]
if (select == "last") {
ans <- rep.int(NA_integer_, q_len)
ans[q_hits] <- s_hits
return(ans)
}
new2("Hits", queryHits=q_hits, subjectHits=s_hits,
queryLength=q_len, subjectLength=s_len,
check=FALSE)
}
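#Sketch of how .findOverlaps.parallel is meant to be used: it splits query and subject by
#chromosome and runs findOverlaps per seqlevel via parallel::mclapply, so the result is intended
#to match a plain findOverlaps call. The toy ranges below are made up.
if(FALSE){
reads <- GRanges("chr1", IRanges(c(100, 5000), width=50))
frags <- GRanges("chr1", IRanges(c(1, 1000, 4000), c(999, 3999, 10000)))
.findOverlaps.parallel(reads, frags, select="first", mc.cores=2)
}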
.exportCoverage <- function(reads, sampleName)
{
coverage_reads = coverage(reads)
export.bedGraph(as(coverage_reads,"GRanges"), paste("coverage_", sampleName, ".bed", sep =""))
}
mapReadsToRestrictionSites <- function(pairedReadsFile, sampleName,BSgenomeName,genome,restrictionSite,enzyme,parallel=F, cores=1)
{
paired_reads_1 <- pairedReadsFile$paired_reads_1
paired_reads_2 <- pairedReadsFile$paired_reads_2
#calculate coverage
all_reads <- c(paired_reads_1, paired_reads_2)
.exportCoverage(all_reads, sampleName)
#take restriction sites
digestFile <- paste("Digest",BSgenomeName,enzyme,sep="_")
if(file.exists(digestFile))
{
load(paste("Digest",BSgenomeName,enzyme,sep="_"))
hindIIIRanges <- resGR
}else
{
hindIIIRanges <- .getRestrictionSitesFromBSgenome(BSgenomeName, genome,restrictionSite,enzyme)
}
#find HindIII sites
if(parallel)
{
requireNamespace("parallel")
hindIII_1 <- .findOverlaps.parallel(paired_reads_1, hindIIIRanges, mc.cores=cores, select="first")
hindIII_2 <- .findOverlaps.parallel(paired_reads_2, hindIIIRanges, mc.cores=cores, select="first")
}else
{
hindIII_1 <- findOverlaps(paired_reads_1, hindIIIRanges, select="first")
hindIII_2 <- findOverlaps(paired_reads_2, hindIIIRanges, select="first")
}
rm(paired_reads_1,paired_reads_2)
#discard read pairs where no HindIII site could be assigned to one or both ends
validInteractions=!is.na(hindIII_1) & !is.na(hindIII_2)
hindIII_1=hindIII_1[validInteractions]
hindIII_2=hindIII_2[validInteractions]
hindIII1=pmin(hindIII_1, hindIII_2)
hindIII2=pmax(hindIII_1, hindIII_2)
#assign hindIII restriction sites to reads
loci1 <- hindIIIRanges[hindIII1]
loci2 <- hindIIIRanges[hindIII2]
interactingLoci <- GRangesList(locus1 = loci1, locus2 = loci2)
outputfilename <- paste("interactingLoci", sampleName, sep = "_")
save(interactingLoci, file=outputfilename)
return(interactingLoci)
}
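#Hypothetical usage sketch of mapReadsToRestrictionSites, continuing from the pairReads example
#above ('paired' is the list returned by pairReads):
if(FALSE){
interactingLoci <- mapReadsToRestrictionSites(paired, sampleName="sample",
BSgenomeName="BSgenome.Hsapiens.UCSC.hg19", genome=BSgenome.Hsapiens.UCSC.hg19,
restrictionSite="A^AGCTT", enzyme="HindIII", parallel=TRUE, cores=4)
}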
#Copyright: Robert Sugar (robert.sugar@ebi.ac.uk), EMBL-EBI, 2011
#counts duplicate rows of a data frame
#returns a data frame with unique rows and their frequencies in the column given by frequencyCol (default "frequencies")
#the data frame is ordered by its coordinate columns before duplicates are counted
.countDuplicates <- function(df, frequencyCol = c("frequencies", "frequency"), considerExistingFrequencies = any(frequencyCol %in% names(df)), ordered=FALSE)
{
if(considerExistingFrequencies)
{
frequencyCol <- frequencyCol[frequencyCol %in% names(df)][1]
if(is.null(df[[frequencyCol]]))
stop("frequencies column missing")
} else if(!considerExistingFrequencies)
{
frequencyCol <- frequencyCol[1]
df[, frequencyCol] <- 1 #one for all
}
#order
chrLocusColumns <- c("chr1", "locus1", "chr2", "locus2")
coverageColumns <- c("chr", "locus")
startEndColumns <- c("seqnames1", "start1", "end1", "strand1", "seqnames2", "start2", "end2", "strand2")
if(all(chrLocusColumns %in% names(df)))
{
df <- df[order(df$chr1, df$locus1, df$chr2, df$locus2), ]
} else
if(all(coverageColumns %in% names(df)))
{
df <- df[order(df$chr, df$locus), ]
} else
if(all(startEndColumns %in% names(df)))
{
df <- df[order(df$seqnames1, df$start1, df$end1, df$strand1, df$seqnames2, df$start2, df$end2, df$strand2), ]
} else #use all columns except for frequency
{
df <- df[do.call(order, df[, names(df)[-which(names(df) == frequencyCol)]]), ]
}
#add up the frequencies for the original and all duplicates of a row
duplicates <- c(duplicated(df[, -which(names(df) == frequencyCol)])) #look for duplicates (excluding frequencies column)
cumulative <- c(0, cumsum(df[, frequencyCol])) #the 0 is needed to consider not the first unique but the last duplicate
beforeFirstUnique <- cumulative[!c(duplicates, FALSE)] #the extra FALSE value is needed for the last group
dfUnique <- df[!duplicates, ]
dfUnique[, frequencyCol] = diff(beforeFirstUnique)
return(dfUnique)
}
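#Worked toy example (made-up values) of the cumulative-sum trick used above: for rows a, a, b with
#frequencies 1, 1, 1 the cumulative vector is c(0, 1, 2, 3), the entries just before each first
#unique row are c(0, 2, 3), and diff() gives 2 and 1.
if(FALSE){
toy <- data.frame(chr="chr1", locus=c(100, 100, 200), frequencies=1)
.countDuplicates(toy) #two rows with frequencies 2 and 1
}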
#####bin interactions into desired bin size e.g. 1Mb########
.binInteractions <- function(interactions, resolution, frequencyCol = "frequencies", considerExistingFrequencies = frequencyCol %in% names(interactions))
{
if(!identical(colnames(interactions)[1:4], c("chr1", "locus1", "chr2", "locus2")))
stop("expecting columns chr1, locus1, chr2, locus2")
if(considerExistingFrequencies && !(frequencyCol %in% names(interactions)))
stop(paste("expecting column", frequencyCol))
#if resolution is given, bins will be calculated from interactions using the resolution
interactions$locus1 <- (interactions$locus1 %/% resolution) * resolution
interactions$locus2 <- (interactions$locus2 %/% resolution) * resolution
#put smaller locus first to make sure a-b and b-a interactions look the same
firstSmaller <- subset(interactions, (as.character(chr1) < as.character(chr2)) | (as.character(chr1) == (as.character(chr2)) & locus1 <= locus2))
secondSmaller <- subset(interactions, !((as.character(chr1) < as.character(chr2)) | (as.character(chr1) == (as.character(chr2)) & locus1 <= locus2)))
#flip first and second if second is smaller
cnames <- colnames(interactions)
cnames[1:4] <- c("chr2", "locus2", "chr1", "locus1")
setnames(secondSmaller,colnames(secondSmaller),cnames)
#stitch them back together
interactions <- rbind(firstSmaller, secondSmaller)
# --------- count the number of interactions between bins -------
interactions <- interactions[order(interactions$chr1, interactions$locus1, interactions$chr2, interactions$locus2), ]
#collapse duplicate bin pairs and sum their frequencies
interactions <- .countDuplicates(interactions, frequencyCol, considerExistingFrequencies)
return(interactions)
}
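#Toy example (made-up coordinates) of binning at 1 Mb resolution: both interactions below fall
#into the same ordered bin pair (chr1:1000000 - chr2:0) once a-b and b-a orientations are flipped,
#so they collapse to one row with frequency 2.
if(FALSE){
toy <- data.frame(chr1=c("chr1", "chr2"), locus1=c(1500000, 250000),
chr2=c("chr2", "chr1"), locus2=c(250000, 1700000), stringsAsFactors=FALSE)
.binInteractions(toy, resolution=1e6)
}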
#############functions to import HiCUP filtered reads##########
.fixChromosomeNames <- function(chrnames)
{
#ensure chromosome names carry a single lower-case "chr" prefix
chrnames <- paste0("chr", chrnames)
chrnames <- sub("CHR", "chr", chrnames)
chrnames <- sub("chrchr", "chr", chrnames)
}
.importHicup <- function(fileName, checkConsistency=TRUE, fileType=ifelse(grepl("\\.bam$", fileName)|grepl("\\.sam$", fileName), "bam", "table"), compressed=ifelse(grepl("\\.gz$", fileName), TRUE, FALSE))
{
#the output of HiCUP is a SAM file named like uniques_ORIGINALFILE_trunc.sam
#this has to be converted using the hicupToTable tool
if(compressed){
tbl <- read.table(gzfile(fileName))
#in the sam file the two ends of the interactions are consecutive rows
odd <- tbl[seq(1,nrow(tbl),2), ]
names(odd) <- c("id1", "flag1", "chr1", "locus1")
levels(odd$chr1) <- .fixChromosomeNames(levels(odd$chr1))
even <- tbl[seq(2,nrow(tbl),2), ]
names(even) <- c("id2", "flag2", "chr2", "locus2")
levels(even$chr2) <- .fixChromosomeNames(levels(even$chr2))
}else{
if(fileType=="table")
{
tbl <- read.table(fileName)
#in the sam file the two ends of the interactions are consecutive rows
odd <- tbl[seq(1,nrow(tbl),2), ]
names(odd) <- c("id1", "flag1", "chr1", "locus1")
levels(odd$chr1) <- .fixChromosomeNames(levels(odd$chr1))
even <- tbl[seq(2,nrow(tbl),2), ]
names(even) <- c("id2", "flag2", "chr2", "locus2")
levels(even$chr2) <- .fixChromosomeNames(levels(even$chr2))
} else if(fileType=="bam")
{
stop('importHicup: HiCUP output BAM/SAM file needs to be converted into a table')
}
}
if(nrow(odd) != nrow(even)) stop('importHicup: reads must be paired in consecutive rows')
joined <- cbind(odd, even)
if(checkConsistency)
{
if(!all(joined$id1==joined$id2)) stop('importHicup: read IDs of paired rows do not match')
}
return(joined[, c("chr1", "locus1", "chr2", "locus2")])
}
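#The expected input is a four-column table (id, flag, chromosome, position) derived from the HiCUP
#SAM output, with the two ends of each di-tag on consecutive rows. A hypothetical call on such a
#file (name is a placeholder):
if(FALSE){
hicupTable <- .importHicup("sample_hicup.table.gz")
}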
.getHindIIIsitesFromHicup <- function(fileName, asRanges = TRUE)
{
sites = read.table(fileName, stringsAsFactors=FALSE, header=TRUE, skip=1)
sites$chr <- sub("^", "chr", sites$Chromosome)
sites$chr <- sub("chrCHR", "chr", sites$chr) #sometimes there is CHR
sites$chr <- sub("chrchr", "chr", sites$chr) #sometimes there is CHR
sites$start <- sites$Fragment_Start_Position
sites$locus <- sites$Fragment_Start_Position
sites$end <- sites$Fragment_End_Position
sites <- sites[, c("chr", "locus", "start", "end")]
if(asRanges)
{
return(GRanges(seqnames=sites$chr, ranges=IRanges(start=sites$start, end=sites$end)))
}
}
.mapHicupToRestrictionFragment <- function(interactions, fragments)
{
source = GRanges(seqnames = Rle(interactions$chr1), ranges = IRanges(interactions$locus1, width = 1),strand = Rle("*", nrow(interactions)))
target = GRanges(seqnames = Rle(interactions$chr2), ranges = IRanges(interactions$locus2, width = 1),strand = Rle("*", nrow(interactions)))
if(is.character(fragments))
{
fragments <- .getHindIIIsitesFromHicup(fragments)
}
firstFragment <- findOverlaps(source, fragments, select="first")
secondFragment <- findOverlaps(target, fragments, select="first")
#filter out non-mapping loci
validInteractions <- !is.na(firstFragment) & !is.na(secondFragment)
firstFragment <- firstFragment[validInteractions]
secondFragment <- secondFragment[validInteractions]
#make sure A-B and B-A are treated together -> put fragment with smaller ID in front
sources <- pmin(firstFragment, secondFragment)
targets <- pmax(firstFragment, secondFragment)
interactions <- as.data.frame(cbind(as.character(seqnames(fragments[sources])), start(ranges(fragments[sources])), as.character(seqnames(fragments[targets])), start(ranges(fragments[targets])), rep(1, times=length(sources))), stringsAsFactors=FALSE)
colnames(interactions) <- c('chr1', 'locus1', 'chr2', 'locus2', 'frequencies')
interactions$locus1=as.numeric(interactions$locus1)
interactions$locus2=as.numeric(interactions$locus2)
interactions$frequencies=as.numeric(interactions$frequencies)
return(interactions)
}
.binomialHiChicup=function(hicupinteraction, restrictionFile, sampleName, cistrans='all', parallel=FALSE, cores=8, removeDiagonal=TRUE)
{
hindGR <- .getHindIIIsitesFromHicup(restrictionFile)
if(parallel)
{
requireNamespace("parallel")
print("running garbage collector before parallel fork")
gc()
}
binned_df_filtered <-hicupinteraction
binned_df_filtered$int1 <-paste(binned_df_filtered$chr1,binned_df_filtered$locus1,sep='_')
binned_df_filtered$int2 <-paste(binned_df_filtered$chr2,binned_df_filtered$locus2,sep='_')
#diagonal removal
if(removeDiagonal)
{
binned_df_filtered <- binned_df_filtered[binned_df_filtered$int1!=binned_df_filtered$int2,]
}
if(cistrans=='cis'){
binned_df_filtered <- binned_df_filtered[binned_df_filtered$chr1==binned_df_filtered$chr2,]
}
if(cistrans=='trans'){
binned_df_filtered <- binned_df_filtered[binned_df_filtered$chr1!=binned_df_filtered$chr2,]
}
#all read pairs used in binomial
numberOfReadPairs <- sum(binned_df_filtered$frequencies)
#calculate coverage
all_bins <- unique(c(unique(binned_df_filtered$int1), unique(binned_df_filtered$int2)))
all_bins <- sort(all_bins)
if(nrow(binned_df_filtered)>1e8)
{
t <- ceiling(nrow(binned_df_filtered)/1e8)
dfList <- list()
dfList[[1]] <- binned_df_filtered[1:1e8,]
for(i in 2:t){
dfList[[i]] <- binned_df_filtered[(((i-1)*1e8)+1):min((i*1e8),nrow(binned_df_filtered)),]
}
dtList <- lapply(dfList, data.table)
covAs <- lapply(dtList, function(x) x[,sum(frequencies), by=int1])
covBs <- lapply(dtList, function(x) x[,sum(frequencies), by=int2])
covAm <- do.call(rbind, covAs)
covBm <- do.call(rbind, covBs)
covA <- covAm[,sum(V1),by=int1]
covB <- covBm[,sum(V1),by=int2]
}else{
binned_dt=data.table(binned_df_filtered)
covA <- binned_dt[,sum(frequencies),by=int1]
covB <- binned_dt[,sum(frequencies),by=int2]
}
covA <- setkey(covA,key='int1')
setnames(covB, 1,'int1')
covB <- setkey(covB,key='int1')
cov=merge(covA,covB,all.x=TRUE,all.y=TRUE,by='int1')
cov$V1.x[is.na(cov$V1.x)]=0
cov$V1.y[is.na(cov$V1.y)]=0
cov$coverage=cov$V1.x+cov$V1.y
coverage=cov$coverage
names(coverage)=cov$int1
sumcov <- sum(coverage)
relative_coverage <- coverage/sumcov
names(relative_coverage)=names(coverage)
binned_df_filtered$coverage_source <- relative_coverage[binned_df_filtered$int1]
binned_df_filtered$coverage_target <- relative_coverage[binned_df_filtered$int2]
#probability correction assuming on average equal probabilities for all interactions
numberOfAllInteractions <- length(all_bins)^2
upperhalfBinNumber <- (length(all_bins)^2-length(all_bins))/2
if(cistrans!='all'){
chromos <- unique(binned_df_filtered$chr1)
chrlens <- c()
for(cr in chromos){
chrlens[cr] <- length(unique(c(unique(binned_df_filtered$locus1[binned_df_filtered$chr1==cr]),unique(binned_df_filtered$locus2[binned_df_filtered$chr2==cr]))))
}
cisBinNumber <-(sum(chrlens^2)-length(all_bins))/2
transBinNumber <- upperhalfBinNumber-cisBinNumber
}
diagonalProb <- sum(relative_coverage^2)
if(cistrans=='all'){
probabilityCorrection <- if(removeDiagonal){1/(1-diagonalProb)}else{1}
}
if(cistrans=='cis'){
probabilityCorrection <- upperhalfBinNumber/cisBinNumber
}
if(cistrans=='trans'){
probabilityCorrection <- upperhalfBinNumber/transBinNumber
}
binned_df_filtered$probabilityOfInteraction <- binned_df_filtered$coverage_source*binned_df_filtered$coverage_target*2*probabilityCorrection
binned_df_filtered$predicted <- binned_df_filtered$probabilityOfInteraction * numberOfReadPairs
if(parallel)
{
requireNamespace("parallel")
if(nrow(binned_df_filtered)>1e8)
{
t <- ceiling(nrow(binned_df_filtered)/1e8)
dfList <- list()
dfList[[1]] <- binned_df_filtered[1:1e8,]
for(i in 2:t){
dfList[[i]] <- binned_df_filtered[(((i-1)*1e8)+1):min((i*1e8),nrow(binned_df_filtered)),]
}
dtList <- lapply(dfList, function(x) as.data.frame(t(cbind(as.numeric(x[["frequencies"]]),
as.numeric(x[["probabilityOfInteraction"]])))))
pvalues=list()
for(i in 1:length(dtList)){
pvalues[[i]] <-unlist(parallel::mclapply(dtList[[i]], function(x)
{
binom.test(x[1], numberOfReadPairs, x[2], alternative = "greater")$p.value
},
mc.cores=cores))
}
pvals=unlist(pvalues)
binned_df_filtered$pvalue <- pvals
}else{
binomParams <- as.data.frame(t(cbind(
as.numeric(binned_df_filtered[["frequencies"]]),
as.numeric(binned_df_filtered[["probabilityOfInteraction"]]
))))
binned_df_filtered$pvalue <- unlist(parallel::mclapply(binomParams, function(x)
{
binom.test(x[1], numberOfReadPairs, x[2], alternative = "greater")$p.value
},
mc.cores=cores))
}
}else{
if(nrow(binned_df_filtered)>1e8)
{
t <- ceiling(nrow(binned_df_filtered)/1e8)
dfList <- list()
dfList[[1]] <- binned_df_filtered[1:1e8,]
for(i in 2:t){
dfList[[i]] <- binned_df_filtered[(((i-1)*1e8)+1):min((i*1e8),nrow(binned_df_filtered)),]
}
pvalues=list()
for(i in 1:length(dfList)){
pvalues[[i]] <-apply(dfList[[i]], 1, function(x)
{
binom.test(as.numeric(x[["frequencies"]]), numberOfReadPairs, as.numeric(x[["probabilityOfInteraction"]]), alternative = "greater")$p.value
}
)
}
pvals=unlist(pvalues)
binned_df_filtered$pvalue <- pvals
}else{
binned_df_filtered$pvalue <- apply(binned_df_filtered, 1, function(x)
{
binom.test(as.numeric(x[["frequencies"]]), numberOfReadPairs, as.numeric(x[["probabilityOfInteraction"]]), alternative = "greater")$p.value
}
)
}
}
#observed over expected log ratio
binned_df_filtered$logFoldChange <- log2(binned_df_filtered$frequencies/binned_df_filtered$predicted)
#multiple testing correction separately for matrices with all interactions/only cis/only trans
if(cistrans=='all'){
binned_df_filtered$qvalue <- if(removeDiagonal){p.adjust(binned_df_filtered$pvalue, method = "BH", n=upperhalfBinNumber)}else{p.adjust(binned_df_filtered$pvalue, method = "BH", n=upperhalfBinNumber+length(all_bins))}
}
if(cistrans=='cis'){
binned_df_filtered$qvalue <- if(removeDiagonal){p.adjust(binned_df_filtered$pvalue, method = "BH", n=cisBinNumber)}else{p.adjust(binned_df_filtered$pvalue, method = "BH", n=cisBinNumber+length(all_bins))}
}
if(cistrans=='trans'){
binned_df_filtered$qvalue <- p.adjust(binned_df_filtered$pvalue, method = "BH", n=transBinNumber)
}
test <- data.frame(binned_df_filtered)
test[,"pvalue"] <- test$pvalue
pval.plot <- ggplot(test,aes(x=pvalue))
tryCatch(
{
x11()
print(pval.plot + geom_density())
},
error=function(cond) {
message("No interactive plot, try saving image")
message(cond)
return(tryCatch(
{
pdf(file=paste(sampleName,"pvalue_distribution.pdf",sep="_"))
print(pval.plot + geom_density())
dev.off()
},
error=function(cond2) {
message("No pdf output, quality assesment plot is not produced")
message(cond2)
return(NA)
},
warning=function(cond2) {
message("No pdf output, quality assesment plot is not produced")
message(cond2)
return(NA)
}
))
},
warning=function(cond) {
message("No interactive plot, try saving image")
message(cond)
return(tryCatch(
{
pdf(file=paste(sampleName,"pvalue_distribution.pdf",sep="_"))
print(pval.plot + geom_density())
dev.off()
},
error=function(cond2) {
message("No pdf output, quality assesment plot is not produced")
message(cond2)
return(NA)
},
warning=function(cond2) {
message("No pdf output, quality assesment plot is not produced")
message(cond2)
return(NA)
}
))
})
binned_df_filtered=binned_df_filtered[,c('chr1','locus1','chr2','locus2','coverage_source','coverage_target','probabilityOfInteraction', 'predicted','frequencies', 'pvalue','qvalue','logFoldChange')]
colnames(binned_df_filtered)=c('chr1','locus1','chr2','locus2','relCoverage1','relCoverage2','probability', 'expected','readCount', 'pvalue','qvalue','logObservedOverExpected')
return(binned_df_filtered)
}
# Author: borbalagerle
###############################################################################
# takes a GRangesList with the interacting restriction fragments in locus1 and locus2
.binomialHiC=function(interactingLoci, res,sampleName,BSgenomeName, genome, restrictionSite, enzyme, parallel=F, cores=NULL,removeDiagonal=TRUE,cistrans='all',filterdist=10000)
{
#take restriction sites
digestFile <- paste("Digest",BSgenomeName,enzyme,sep="_")
if(file.exists(digestFile))
{
load(paste("Digest",BSgenomeName,enzyme,sep="_"))
hindIIIRanges <- resGR
}else
{
hindIIIRanges <- .getRestrictionSitesFromBSgenome(BSgenomeName, genome, restrictionSite, enzyme)
}
elementMetadata(hindIIIRanges)$score=1:length(hindIIIRanges)
#remove self-ligations and interactions between adjacent fragments by assigning restriction fragment numbers to the reads and removing pairs whose fragment numbers differ by 1 or less
x=findOverlaps(interactingLoci[[1]],hindIIIRanges,select='first')
y=findOverlaps(interactingLoci[[2]],hindIIIRanges,select='first')
interactingLoci1 <- interactingLoci[[1]]
interactingLoci2 <- interactingLoci[[2]]
interactingLoci1$score=hindIIIRanges$score[x]
interactingLoci2$score=hindIIIRanges$score[y]
interactingLoci <- GRangesList(locus1=interactingLoci1,locus2=interactingLoci2)
score=interactingLoci[[2]]$score-interactingLoci[[1]]$score
interactingLoci[[1]]=interactingLoci[[1]][abs(score)>1]
interactingLoci[[2]]=interactingLoci[[2]][abs(score)>1]
#filter out read pairs closer than filterdist (10 kb by default) to remove incomplete digestion products
# df_int <- data.frame(as.vector(seqnames(interactingLoci[[1]])), start(ranges(interactingLoci[[1]])), as.vector(seqnames(interactingLoci[[2]])), start(ranges(interactingLoci[[2]])))
df_int <- data.frame(as.character(seqnames(interactingLoci[[1]])), start(ranges(interactingLoci[[1]])), as.character(seqnames(interactingLoci[[2]])), start(ranges(interactingLoci[[2]])), stringsAsFactors=FALSE)
colnames(df_int) <- c("chr1", "locus1", "chr2", "locus2")
df_filtered <- df_int
df_filtered$dist <-as.vector(ifelse(df_filtered$chr1 == df_filtered$chr2, abs(df_filtered$locus1 - df_filtered$locus2), Inf))
df_filtered <-df_filtered[df_filtered$dist>filterdist,]
df_filtered <-df_filtered[,1:4]
#bin interactions according to resolution
binned_df_filtered <- .binInteractions(df_filtered, res)
binned_df_filtered$int1 <-paste(binned_df_filtered$chr1,binned_df_filtered$locus1,sep='_')
binned_df_filtered$int2 <-paste(binned_df_filtered$chr2,binned_df_filtered$locus2,sep='_')
#diagonal removal - interactions within the same bin
if(removeDiagonal)
{
subs <-which(binned_df_filtered$int1!=binned_df_filtered$int2)
binned_df_filtered <- binned_df_filtered[subs,]
}
if(cistrans=='cis'){
subs <-which(binned_df_filtered$chr1==binned_df_filtered$chr2)
binned_df_filtered <- binned_df_filtered[subs,]
}
if(cistrans=='trans'){
subs <-which(binned_df_filtered$chr1!=binned_df_filtered$chr2)
binned_df_filtered <- binned_df_filtered[subs,]
}
#all read pairs used in binomial
numberOfReadPairs <- sum(binned_df_filtered$frequencies)
#calculate coverage
all_bins <- unique(c(unique(binned_df_filtered$int1), unique(binned_df_filtered$int2)))
all_bins <- sort(all_bins)
if(nrow(binned_df_filtered)>1e8)
{
t <- ceiling(nrow(binned_df_filtered)/1e8)
dfList <- list()
dfList[[1]] <- binned_df_filtered[1:1e8,]
for(i in 2:t){
dfList[[i]] <- binned_df_filtered[(((i-1)*1e8)+1):min((i*1e8),nrow(binned_df_filtered)),]
}
dtList <- lapply(dfList, data.table)
covAs <- lapply(dtList, function(x) x[,sum(frequencies), by=int1])
covBs <- lapply(dtList, function(x) x[,sum(frequencies), by=int2])
covAm <- do.call(rbind, covAs)
covBm <- do.call(rbind, covBs)
covA <- covAm[,sum(V1),by=int1]
covB <- covBm[,sum(V1),by=int2]
}else{
binned_dt=data.table(binned_df_filtered)
covA <- binned_dt[,sum(frequencies),by=int1]
covB <- binned_dt[,sum(frequencies),by=int2]
}
covA <- setkey(covA,key='int1')
setnames(covB, 1,'int1')
covB <- setkey(covB,key='int1')
cov=merge(covA,covB,all.x=TRUE,all.y=TRUE,by='int1')
cov$V1.x[is.na(cov$V1.x)]=0
cov$V1.y[is.na(cov$V1.y)]=0
cov$coverage=cov$V1.x+cov$V1.y
coverage=cov$coverage
names(coverage)=cov$int1
sumcov <- sum(coverage)
relative_coverage <- coverage/sumcov
names(relative_coverage)=names(coverage)
binned_df_filtered$cov1 <- relative_coverage[binned_df_filtered$int1]
binned_df_filtered$cov2 <- relative_coverage[binned_df_filtered$int2]
#probability correction assuming on average equal probabilities for all interactions
numberOfAllInteractions <- length(all_bins)^2
upperhalfBinNumber <- (length(all_bins)^2-length(all_bins))/2
chromos <- unique(binned_df_filtered$chr1)
chrlens <- c()
for(cr in chromos){
chrlens[cr] <- length(unique(c(unique(binned_df_filtered$locus1[binned_df_filtered$chr1==cr]),unique(binned_df_filtered$locus2[binned_df_filtered$chr2==cr]))))
}
cisBinNumber <-(sum(chrlens^2)-length(all_bins))/2
transBinNumber <- upperhalfBinNumber-cisBinNumber
diagonalProb <- sum(relative_coverage^2)
if(cistrans=='all'){
probabilityCorrection <- if(removeDiagonal){1/(1-diagonalProb)}else{1}
}
if(cistrans=='cis'){
probabilityCorrection <- upperhalfBinNumber/cisBinNumber
}
if(cistrans=='trans'){
probabilityCorrection <- upperhalfBinNumber/transBinNumber
}
#calculate probability of random interaction as a product of the relative coverages of individual bins*2
binned_df_filtered$probability <- binned_df_filtered$cov1*binned_df_filtered$cov2*2*probabilityCorrection
#number of expected reads by chance
binned_df_filtered$predicted <- binned_df_filtered$probability * numberOfReadPairs
#calculate cumulative binomial test for observed number of reads given the probability of seeing a random read
if(parallel)
{
requireNamespace("parallel")
if(nrow(binned_df_filtered)>1e8)
{
t <- ceiling(nrow(binned_df_filtered)/1e8)
dfList <- list()
dfList[[1]] <- binned_df_filtered[1:1e8,]
for(i in 2:t){
dfList[[i]] <- binned_df_filtered[(((i-1)*1e8)+1):min((i*1e8),nrow(binned_df_filtered)),]
}
dtList <- lapply(dfList, function(x) as.data.frame(t(cbind(as.numeric(x[["frequencies"]]),
as.numeric(x[["probability"]])))))
pvalues=list()
for(i in 1:length(dtList)){
pvalues[[i]] <-unlist(parallel::mclapply(dtList[[i]], function(x)
{
binom.test(x[1], numberOfReadPairs, x[2], alternative = "greater")$p.value
},
mc.cores=cores))
}
pvals=unlist(pvalues)
binned_df_filtered$pvalue <- pvals
}else{
binomParams <- as.data.frame(t(cbind(
as.numeric(binned_df_filtered[["frequencies"]]),
as.numeric(binned_df_filtered[["probability"]]
))))
binned_df_filtered$pvalue <- unlist(parallel::mclapply(binomParams, function(x)
{
binom.test(x[1], numberOfReadPairs, x[2], alternative = "greater")$p.value
},
mc.cores=cores))
}
}else{
if(nrow(binned_df_filtered)>1e8)
{
t <- ceiling(nrow(binned_df_filtered)/1e8)
dfList <- list()
dfList[[1]] <- binned_df_filtered[1:1e8,]
for(i in 2:t){
dfList[[i]] <- binned_df_filtered[(((i-1)*1e8)+1):min((i*1e8),nrow(binned_df_filtered)),]
}
pvalues=list()
for(i in 1:length(dfList)){
pvalues[[i]] <-apply(dfList[[i]], 1, function(x)
{
binom.test(as.numeric(x[["frequencies"]]), numberOfReadPairs, as.numeric(x[["probability"]]), alternative = "greater")$p.value
}
)
}
pvals=unlist(pvalues)
binned_df_filtered$pvalue <- pvals
}else{
binned_df_filtered$pvalue <- apply(binned_df_filtered, 1, function(x)
{
binom.test(as.numeric(x[["frequencies"]]), numberOfReadPairs, as.numeric(x[["probability"]]), alternative = "greater")$p.value
}
)
}
}
#observed over expected log ratio
binned_df_filtered$logFoldChange <- log2(binned_df_filtered$frequencies/binned_df_filtered$predicted)
#multiple testing correction separately for matrices with all interactions/only cis/only trans
if(cistrans=='all'){
binned_df_filtered$qvalue <- if(removeDiagonal){p.adjust(binned_df_filtered$pvalue, method = "BH", n=upperhalfBinNumber)}else{p.adjust(binned_df_filtered$pvalue, method = "BH", n=upperhalfBinNumber+length(all_bins))}
}
if(cistrans=='cis'){
binned_df_filtered$qvalue <- if(removeDiagonal){p.adjust(binned_df_filtered$pvalue, method = "BH", n=cisBinNumber)}else{p.adjust(binned_df_filtered$pvalue, method = "BH", n=cisBinNumber+length(all_bins))}
}
if(cistrans=='trans'){
binned_df_filtered$qvalue <- p.adjust(binned_df_filtered$pvalue, method = "BH", n=transBinNumber)
}
test <- data.frame(binned_df_filtered)
test[,"pvalue"] <- test$pvalue
pval.plot <- ggplot(test,aes(x=pvalue))
tryCatch(
{
x11()
print(pval.plot + geom_density())
},
error=function(cond) {
message("No interactive plot, try saving image")
message(cond)
return(tryCatch(
{
pdf(file=paste(sampleName,"pvalue_distribution.pdf",sep="_"))
print(pval.plot + geom_density())
dev.off()
},
error=function(cond2) {
message("No pdf output, quality assesment plot is not produced")
message(cond2)
return(NA)
},
warning=function(cond2) {
message("No pdf output, quality assesment plot is not produced")
message(cond2)
return(NA)
}
))
},
warning=function(cond) {
message("No interactive plot, try saving image")
message(cond)
return(tryCatch(
{
pdf(file=paste(sampleName,"pvalue_distribution.pdf",sep="_"))
print(pval.plot + geom_density())
dev.off()
},
error=function(cond2) {
message("No pdf output, quality assesment plot is not produced")
message(cond2)
return(NA)
},
warning=function(cond2) {
message("No pdf output, quality assesment plot is not produced")
message(cond2)
return(NA)
}
))
})
binned_df_filtered=binned_df_filtered[,c('chr1','locus1','chr2','locus2','cov1','cov2','probability', 'predicted','frequencies', 'pvalue','qvalue','logFoldChange')]
colnames(binned_df_filtered)=c('chr1','locus1','chr2','locus2','relCoverage1','relCoverage2','probability', 'expected','readCount', 'pvalue','qvalue','logObservedOverExpected')
return(binned_df_filtered)
}
####################################################################################################################################
####################################################################################################################################
## run GOTHiC
GOTHiC <- function(fileName1, fileName2, sampleName, res, BSgenomeName='BSgenome.Hsapiens.UCSC.hg19', genome=BSgenome.Hsapiens.UCSC.hg19, restrictionSite='A^AGCTT', enzyme='HindIII',cistrans='all',filterdist=10000, DUPLICATETHRESHOLD=1, fileType='BAM', parallel=FALSE, cores=NULL){
if(file.exists(paste(sampleName,"paired_filtered",sep="_"))){
message("Loading paired reads file ...")
load(paste(sampleName,"paired_filtered",sep="_"))
pairedReadsFile <- filtered
}else{
message("Pairing reads")
pairedReadsFile <- pairReads(fileName1, fileName2, sampleName, DUPLICATETHRESHOLD=DUPLICATETHRESHOLD, fileType=fileType)
}
if (file.exists(paste("interactingLoci",sampleName,sep="_"))){
message("Loading mapped reads file ...")
load(paste("interactingLoci",sampleName,sep="_"))
interactions <- interactingLoci
}else{
message("Mapping reads to restriction sites")
interactions <- mapReadsToRestrictionSites(pairedReadsFile, sampleName, BSgenomeName, genome, restrictionSite, enzyme, parallel, cores)
}
message("Computing binomial ...")
binom <- .binomialHiC(interactions, res,sampleName, BSgenomeName, genome, restrictionSite, enzyme, parallel, cores,cistrans=cistrans,filterdist=filterdist)
return(binom)
}
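#Hypothetical end-to-end call of GOTHiC on two BAM files at 1 Mb resolution (file and sample names
#are placeholders):
if(FALSE){
results <- GOTHiC("sample_R1.bam", "sample_R2.bam", sampleName="sample", res=1e6,
BSgenomeName="BSgenome.Hsapiens.UCSC.hg19", genome=BSgenome.Hsapiens.UCSC.hg19,
restrictionSite="A^AGCTT", enzyme="HindIII", cistrans="all", parallel=FALSE)
head(results[order(results$qvalue), ])
}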
GOTHiChicup <-function(fileName, sampleName, res, restrictionFile, cistrans='all', parallel=FALSE, cores=NULL)
{
interactions <- .importHicup(fileName, checkConsistency=TRUE, fileType=ifelse(grepl("\\.bam$", fileName)|grepl("\\.sam$", fileName), "bam", "table"))
interactions <- .mapHicupToRestrictionFragment(interactions, restrictionFile)
interactions <- .binInteractions(interactions, res)
binom <- .binomialHiChicup(interactions, restrictionFile, sampleName, cistrans, parallel, cores)
return(binom)
}
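#Hypothetical call of GOTHiChicup on a HiCUP-derived table and the matching HiCUP digest file
#(both file names are placeholders):
if(FALSE){
hicupResults <- GOTHiChicup("sample_hicup.table", sampleName="sample", res=1e6,
restrictionFile="Digest_Human_Genome_HindIII.txt", cistrans="all", parallel=FALSE)
}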