Nothing
#' @include union.R
NULL
#' Add meta data column to anchors based on GRanges
#'
#' \code{annotateAnchors} adds a logical variable to meta data columns in the
#' anchors based on a GRanges object of features' genomic coordinates
#'
#' This function adds column of TRUE/FALSE values on the loops object
#' anchors whether a feature is observed nearby in \code{features}. The name
#' of this column that will be in the anchors GRanges object is specified by
#' a user defined string \code{featureName}. Gap tolerance between a feature
#' and an anchor is specified by \code{maxgap}, where the default is 1,000bp.
#'
#' @param dlo A loops object whose anchors will be annotated
#' @param features A Granges object corresponding to locations of interest
#' @param featureName A string that will be the mcol name in anchors
#' @param maxgap A value of max permissible gap between a feature and anchor
#'
#' @return A loops object with new meta data column in anchors
#'
#' @examples
#' # Annotate whether anchors are near a gene body; within 1kb
#' rda<-paste(system.file('rda',package='diffloop'),'loops.small.rda',sep='/')
#' load(rda)
#' gb <-getHumanGenes()
#' loops.small <- annotateAnchors(loops.small,gb,'nearGeneBody')
#'
#' # Adding close to gene bodies with no gap tolerance
#' loops.small <- annotateAnchors(loops.small,gb,'inGeneBody',0)
#'
#' @import GenomicRanges
#' @export
setGeneric(name = "annotateAnchors", def = function(dlo, features,
featureName, maxgap) standardGeneric("annotateAnchors"))
.annotateAnchors <- function(dlo, features, featureName, maxgap) {
hits <- suppressWarnings(findOverlaps(features, dlo@anchors,
maxgap = maxgap))
idx <- unique(subjectHits(hits))
values <- data.frame(matrix(FALSE, ncol = 1, nrow = length(ranges(dlo@anchors))))
values[idx, ] <- TRUE
colnames(values) <- featureName
mcols(dlo@anchors) <- c(mcols(dlo@anchors), values)
return(dlo)
}
#' @rdname annotateAnchors
setMethod(f = "annotateAnchors", signature = c("loops", "GRanges",
"character", "missing"), definition = function(dlo, features,
featureName, maxgap = 1000) {
maxgap <- 1000
.annotateAnchors(dlo, features, featureName, maxgap)
})
#' @rdname annotateAnchors
setMethod(f = "annotateAnchors", signature = c("loops", "GRanges",
"character", "numeric"), definition = function(dlo, features,
featureName, maxgap) {
.annotateAnchors(dlo, features, featureName, maxgap)
})
#' Add meta data column to anchors based on .bigwig
#'
#' \code{annotateAnchors.bigwig} adds a numeric variable to meta data
#' columns in the anchors slot based on a user-specified .bigwig
#' file.
#'
#' This function adds a meta data column to anchors of the specified
#' loops object. All values from the .bigwig file that overlap with the
#' each anchor are handled by the FUN (default is to average them) to
#' produce a single value added to the mcols of the anchors.
#'
#' @param dlo A loops object whose anchors will be annotated
#' @param file A file corresponding to the bigwig of interest
#' @param FUN A function used to combine multiple values observed in a single anchor; default is mean
#' @param pad An integer value of to pad the anchors of the loops object; default is 0
#'
#' @return A loops object with new numeric meta data column in anchors
#'
#' @examples
#' # Annotate whether anchors are near a gene body; within 1kb
#' rda<-paste(system.file('rda',package='diffloop'),'loops.small.rda',sep='/')
#' load(rda)
#' gb <-getHumanGenes()
#' loops.small <- annotateAnchors(loops.small,gb,'nearGeneBody')
#'
#' @import GenomicRanges
#' @importFrom rtracklayer import.bw
#' @importFrom tools file_path_sans_ext
#'
#' @export
setGeneric(name = "annotateAnchors.bigwig", function(dlo, file, FUN = mean, pad = 0)
standardGeneric("annotateAnchors.bigwig"))
#' @rdname annotateAnchors.bigwig
setMethod(f = "annotateAnchors.bigwig", definition = function(dlo, file, FUN = mean, pad = 0) {
sample <- basename(file_path_sans_ext(file))
bw.vals <- import.bw(file, which = addchr(dlo@anchors))
ovl.k <- findOverlaps(addchr(dlo@anchors), bw.vals)
qh.k <- queryHits(ovl.k) # anchors
sh.k <- subjectHits(ovl.k)
values.t <- as.data.frame(tapply(mcols(bw.vals[sh.k])$score, qh.k, FUN))
#A lot of extra effort to handle anchor regions with no values
colnames(values.t) <- "bwvalues"
vNA <- as.character(1:length(ranges(dlo@anchors)))
Missing <- setdiff(vNA, rownames(values.t))
m.df <- data.frame(m.vals = matrix(NA, nrow = length(Missing), ncol = 1))
row.names(m.df) <- Missing
colnames(m.df) <- "bwvalues"
total <- rbind(values.t, m.df)
values <- as.vector(total[order(as.numeric(row.names(total))),, drop = FALSE])
colnames(values) <- sample
mcols(dlo@anchors) <- as.data.frame(c(mcols(dlo@anchors), values))
return(dlo)
})
#' Add meta data column to anchors based on bedgraph scores
#'
#' \code{annotateAnchors.bed} adds a numeric variable to meta data
#' columns in the anchors slot based on a user-specified .bed file
#' where the fourth column is a numeric score.
#'
#' This function adds a meta data column to anchors of the specified
#' loops object. All values from the .bed file that overlap with the
#' each anchor are handled by the FUN (default is to average them) to
#' produce a single value added to the mcols of the anchors.
#'
#' @param dlo A loops object whose anchors will be annotated
#' @param file A string pointing to the bed file of interest
#' @param FUN A function used to combine multiple values observed in a single anchor; default is mean
#' @param pad An integer value of to pad the anchors of the loops object; default is 0
#'
#' @return A loops object with new numeric meta data column in anchors
#'
#' @examples
#' # Annotate whether anchors are near a gene body; within 1kb
#' rda<-paste(system.file('rda',package='diffloop'),'loops.small.rda',sep='/')
#' load(rda)
#' gb <-getHumanGenes()
#' loops.small <- annotateAnchors(loops.small,gb,'nearGeneBody')
#'
#' @import GenomicRanges
#' @importFrom tools file_path_sans_ext
#'
#' @export
setGeneric(name = "annotateAnchors.bed", function(dlo, file, FUN = mean, pad = 0)
standardGeneric("annotateAnchors.bed"))
#' @rdname annotateAnchors.bed
setMethod(f = "annotateAnchors.bed", definition = function(dlo, file, FUN = mean, pad = 0) {
sample <- basename(file_path_sans_ext(file))
bed.vals <- GRanges(setNames(read.table(file, header = TRUE), c("chr", "start", "stop", "score")))
ovl.k <- findOverlaps(dlo@anchors, rmchr(bed.vals))
qh.k <- queryHits(ovl.k) # anchors
sh.k <- subjectHits(ovl.k)
values.t <- as.data.frame(tapply(mcols(bed.vals[sh.k])$score, qh.k, FUN))
#A lot of extra effort to handle anchor regions with no values
colnames(values.t) <- "bwvalues"
vNA <- as.character(1:length(ranges(dlo@anchors)))
Missing <- setdiff(vNA, rownames(values.t))
m.df <- data.frame(m.vals = matrix(NA, nrow = length(Missing), ncol = 1))
row.names(m.df) <- Missing
colnames(m.df) <- "bwvalues"
total <- rbind(values.t, m.df)
values <- as.vector(total[order(as.numeric(row.names(total))),, drop = FALSE])
colnames(values) <- sample
mcols(dlo@anchors) <- as.data.frame(c(mcols(dlo@anchors), values))
return(dlo)
})
#' Get protein coding gene regions
#'
#' \code{getHumanGenes} returns a \code{GRanges} object of all protein
#' coding genes genome-wide or within specified chromosomes. Annotation
#' is from regions from hg19/Gr37 and protein coding genes.
#'
#' This function returns a \code{GRanges} object with the coordinates and
#' gene IDs of all protein coding genes either genome-wide
#' (by default) orspecified within a particular chromosome.
#'
#' @param chr A vector of chromosomes
#' @param cache logic variable (default = TRUE) to use genes from July.2015 freeze
#'
#' @return A GRanges object
#'
#' @examples
#' # Grab all protein coding gene locations genome-wide
#' pc.genes <- getHumanGenes()
#' # Grab all protein coding gene loctions on chromosome 1
#' chr1 <- getHumanGenes(c('1'))
#' @import GenomicRanges
#' @import biomaRt
#' @importFrom GenomeInfoDb sortSeqlevels seqlevels seqlevels<-
#' @importFrom S4Vectors queryHits subjectHits
#'
#' @export
setGeneric(name = "getHumanGenes", def = function(chr, cache = TRUE) standardGeneric("getHumanGenes"))
#' @import GenomicRanges
.getHumanGenesNoCache <- function(chr) {
vals = list(chr, "protein_coding")
mart = useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org",
path="/biomart/martservice" ,dataset="hsapiens_gene_ensembl")
geneinfo = getBM(attributes = c("chromosome_name", "start_position",
"end_position", "external_gene_name"), filters = c("chromosome_name",
"biotype"), values = vals, mart = mart)
colnames(geneinfo) <- c("chr", "start", "end", "id")
raw <- makeGRangesFromDataFrame(geneinfo, keep.extra.columns = TRUE,
ignore.strand = TRUE, seqnames.field = c("chr"), start.field = "start",
end.field = c("end"), starts.in.df.are.0based = FALSE)
gr <- sortSeqlevels(raw)
gr <- sort(gr)
return(gr)
}
#' @rdname getHumanGenes
setMethod(f = "getHumanGenes", signature = c("missing", "ANY"),
definition = function(chr, cache = TRUE) {
all <- c("1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17", "18",
"19", "20", "21", "22", "X", "Y")
if (cache) {
human.genes <- NULL
rda <- paste(system.file("rda", package = "diffloop"),
"human.genes.rda", sep = "/")
load(rda)
return(human.genes[as.vector(as.data.frame(human.genes)$seqnames) %in%
as.vector(all)])
} else {
return(.getHumanGenesNoCache(all))
}
})
#' @rdname getHumanGenes
setMethod(f = "getHumanGenes", signature = c("character", "ANY"),
definition = function(chr, cache = TRUE) {
if (cache) {
human.genes = NULL
rda <- paste(system.file("rda", package = "diffloop"),
"human.genes.rda", sep = "/")
load(rda)
return(human.genes[is.element(as.vector(as.data.frame(human.genes)$seqnames),
as.vector(chr))])
} else {
return(.getHumanGenesNoCache(chr))
}
})
#' Get Human Transcription Start Sites
#'
#' \code{getHumanTSS} returns a \code{GRanges} object of all
#' transcription start sites for humans. Regions from hg19/Gr37 for
#' protein coding regions.
#'
#' This function returns a \code{GRanges} object with the coordinates and
#' gene TSS. The start and end of the IRanges slot will be the same number,
#' so consider using the \code{padGRanges} function after calling this function.
#'
#' @param chr Specifies what chromosomes are desired for the TSS#'
#' @return A GRanges object
#'
#' @examples
#' # Grab all transition start sites genome-wide
#' human.TSS <- getHumanTSS()
#' @import GenomicRanges
#' @import biomaRt
#' @importFrom GenomeInfoDb sortSeqlevels seqlevels seqlevels<-
#' @importFrom S4Vectors queryHits subjectHits
#'
#' @export
setGeneric(name = "getHumanTSS", def = function(chr) standardGeneric("getHumanTSS"))
#' @rdname getHumanTSS
setMethod(f = "getHumanTSS", signature = c("missing"),
definition = function(chr) {
all <- c("1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17", "18",
"19", "20", "21", "22", "X", "Y")
geneinfo = NULL
rda <- paste(system.file("rda", package = "diffloop"),
"geneinfo.h.rda", sep = "/")
load(rda)
human.TSS <- GRanges(geneinfo[,c(1,2,3,4)])
end(human.TSS[geneinfo$strand == 1]) <- start(human.TSS[geneinfo$strand == 1])
start(human.TSS[geneinfo$strand == -1]) <- end(human.TSS[geneinfo$strand == -1])
return(human.TSS[as.vector(as.data.frame(human.TSS)$seqnames) %in% as.vector(all)])
})
#' @rdname getHumanTSS
setMethod(f = "getHumanTSS", signature = c("character"),
definition = function(chr) {
geneinfo = NULL
rda <- paste(system.file("rda", package = "diffloop"),
"geneinfo.h.rda", sep = "/")
load(rda)
human.TSS <- GRanges(geneinfo[,c(1,2,3,4)])
end(human.TSS[geneinfo$strand == 1]) <- start(human.TSS[geneinfo$strand == 1])
start(human.TSS[geneinfo$strand == -1]) <- end(human.TSS[geneinfo$strand == -1])
return(human.TSS[is.element(as.vector(as.data.frame(human.TSS)$seqnames), as.vector(chr))])
})
#' Get Mouse Transcription Start Sites
#'
#' \code{getMouseTSS} returns a \code{GRanges} object of all
#' transcription start sites for humans. Regions from mm9 for
#' protein coding regions.
#'
#' This function returns a \code{GRanges} object with the coordinates and
#' gene TSS. The start and end of the IRanges slot will be the same number,
#' so consider using the \code{padGRanges} function after calling this function.
#'
#' @param chr Specifies what chromosomes are desired for the TSS#
#' @return A GRanges object
#'
#' @examples
#' # Grab all transition start sites genome-wide
#' mouse.TSS <- getMouseTSS()
#' @import GenomicRanges
#' @import biomaRt
#' @importFrom GenomeInfoDb sortSeqlevels seqlevels seqlevels<-
#' @importFrom S4Vectors queryHits subjectHits
#'
#' @export
setGeneric(name = "getMouseTSS", def = function(chr) standardGeneric("getMouseTSS"))
#' @rdname getMouseTSS
setMethod(f = "getMouseTSS", signature = c("missing"),
definition = function(chr) {
all <- c("1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17", "18",
"19", "X", "Y")
geneinfo = NULL
rda <- paste(system.file("rda", package = "diffloop"),
"geneinfo.m.rda", sep = "/")
load(rda)
mouseTSS <- GRanges(geneinfo[,c(1,2,3,4)])
end(mouseTSS[geneinfo$strand == 1]) <- start(mouseTSS[geneinfo$strand == 1])
start(mouseTSS[geneinfo$strand == -1]) <- end(mouseTSS[geneinfo$strand == -1])
return(mouseTSS[as.vector(as.data.frame(mouseTSS)$seqnames) %in% as.vector(all)])
})
#' @rdname getMouseTSS
setMethod(f = "getMouseTSS", signature = c("character"),
definition = function(chr) {
geneinfo = NULL
rda <- paste(system.file("rda", package = "diffloop"),
"geneinfo.m.rda", sep = "/")
load(rda)
mouseTSS <- GRanges(geneinfo[,c(1,2,3,4)])
end(mouseTSS[geneinfo$strand == 1]) <- start(mouseTSS[geneinfo$strand == 1])
start(mouseTSS[geneinfo$strand == -1]) <- end(mouseTSS[geneinfo$strand == -1])
return(mouseTSS[as.vector(as.data.frame(mouseTSS)$seqnames) %in% as.vector(all)])
})
#' Get protein coding gene regions
#'
#' \code{getMouseGenes} returns a \code{GRanges} object of all protein
#' coding genes genome-wide or within specified chromosomes. Annotation
#' is from regions from mm9 and protein coding genes.
#'
#' This function returns a \code{GRanges} object with the coordinates and
#' gene IDs of all protein coding genes either genome-wide
#' (by default) orspecified within a particular chromosome.
#'
#' @param chr A vector of chromosomes
#'
#' @return A GRanges object
#'
#' @examples
#' # Grab all protein coding gene locations genome-wide
#' pc.genes <- getMouseGenes()
#' # Grab all protein coding gene loctions on chromosome 1
#' chr1 <- getHumanGenes(c('1'))
#' @import GenomicRanges
#' @importFrom GenomeInfoDb sortSeqlevels seqlevels seqlevels<-
#' @importFrom S4Vectors queryHits subjectHits
#'
#' @export
setGeneric(name = "getMouseGenes", def = function(chr) standardGeneric("getMouseGenes"))
#' @rdname getMouseGenes
setMethod(f = "getMouseGenes", signature = c("missing"),
definition = function(chr) {
all <- c("1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17", "18",
"19", "X", "Y")
rda <- paste(system.file("rda", package = "diffloop"),
"geneinfo.m.rda", sep = "/")
geneinfo <- ""
load(rda)
gi <- geneinfo[as.vector(as.data.frame(geneinfo)$chrom) %in% as.vector(all),c(1,2,3,4)]
return(GRanges(gi))
})
#' @rdname getMouseGenes
setMethod(f = "getMouseGenes", signature = c("character"),
definition = function(chr) {
rda <- paste(system.file("rda", package = "diffloop"),
"geneinfo.m.rda", sep = "/")
load(rda)
geneinfo <- ""
gi <- geneinfo[as.vector(as.data.frame(geneinfo)$chrom) %in% as.vector(all),c(1,2,3,4)]
return(GRanges(gi))
})
#' Annotate loops as Enhancer-Promoter or CTCF-CTCF
#'
#' \code{annotateLoops} adds a column to the rowData slot of a loops
#' object categorizing loops as either e-p (enhancer-promoter), p-p
#' (promoter-promoter), e-e (enhancer-enhancer), ctcf (CTCF-CTCF)
#' or none (no biological annotation).
#'
#' Function annotates loops where both anchors are near CTCF peaks or where
#' one anchor is near an enhancer and the other near a promoter. Consider using
#' functions \code{addchr}, \code{rmchr}, \code{bedToGRanges}, and \code{padGRanges}
#' when setting up the 3 GRanges inputs. Provide a blank GRanges objects to ignore
#' classification for one set.
#'
#' @param lto A loops object whose loops will be annotated
#' @param ctcf GRanges object corresponding to locations of CTCF peaks
#' @param enhancer GRanges object corresponding to locations of enhancer peaks
#' @param promoter GRanges object corresponding to locations of promoter regions
#'
#' @return A loops object with an additional row 'loop.type' in the rowData slot
#'
#' @examples
#' rda<-paste(system.file('rda',package='diffloop'),'loops.small.rda',sep='/')
#' load(rda)
#' ctcf_j <- system.file('extdata','Jurkat_CTCF_chr1.narrowPeak',package='diffloop')
#' ctcf <- rmchr(padGRanges(bedToGRanges(ctcf_j), pad = 1000))
#' h3k27ac_j <- system.file('extdata','Jurkat_H3K27ac_chr1.narrowPeak',package='diffloop')
#' h3k27ac <- rmchr(padGRanges(bedToGRanges(h3k27ac_j), pad = 1000))
#' promoter <- padGRanges(getHumanTSS(c('1')), pad = 1000)
#' # annotated_small <- annotateLoops(loops.small, ctcf, h3k27ac, promoter)
#'
#' @import GenomicRanges
#' @export
setGeneric(name = "annotateLoops", def = function(lto, ctcf,
enhancer, promoter) standardGeneric("annotateLoops"))
#' @rdname annotateLoops
setMethod(f = "annotateLoops", signature = c("loops", "GRanges",
"GRanges", "GRanges"), definition = function(lto, ctcf, enhancer,
promoter) {
lto.df <- summary(lto)
Ranchors <- GRanges(lto.df$chr_1, IRanges(lto.df$start_1, lto.df$end_1))
Lanchors <- GRanges(lto.df$chr_2, IRanges(lto.df$start_2, lto.df$end_2))
# Determine if right anchor is near CTCF peak
Rhits.c <- suppressWarnings(findOverlaps(ctcf, Ranchors,
maxgap = 0))
Rvalues.c <- rep(FALSE, dim(lto.df)[1])
Rvalues.c[unique(subjectHits(Rhits.c))] <- TRUE
# Determine if left anchor is near CTCF peak
Lhits.c <- suppressWarnings(findOverlaps(ctcf, Lanchors,
maxgap = 0))
Lvalues.c <- rep(FALSE, dim(lto.df)[1])
Lvalues.c[unique(subjectHits(Lhits.c))] <- TRUE
#######
# Determine if right anchor is near promoter region
Rhits.p <- suppressWarnings(findOverlaps(promoter, Ranchors,
maxgap = 0))
Rvalues.p <- rep(FALSE, dim(lto.df)[1])
Rvalues.p[unique(subjectHits(Rhits.p))] <- TRUE
# Determine if left anchor is near promoter region
Lhits.p <- suppressWarnings(findOverlaps(promoter, Lanchors,
maxgap = 0))
Lvalues.p <- rep(FALSE, dim(lto.df)[1])
Lvalues.p[unique(subjectHits(Lhits.p))] <- TRUE
#######
# Determine if right anchor is near enhancer peak
Rhits.e <- suppressWarnings(findOverlaps(enhancer, Ranchors,
maxgap = 0))
Rvalues.e <- rep(FALSE, dim(lto.df)[1])
Rvalues.e[unique(subjectHits(Rhits.e))] <- TRUE
# Determine if left anchor is near enhancer peak
Lhits.e <- suppressWarnings(findOverlaps(enhancer, Lanchors,
maxgap = 0))
Lvalues.e <- rep(FALSE, dim(lto.df)[1])
Lvalues.e[unique(subjectHits(Lhits.e))] <- TRUE
#######
ctcf.loops <- Lvalues.c & Rvalues.c
ee.loops <- as.integer(Lvalues.e & Rvalues.e) * 10
pp.loops <- as.integer(Lvalues.p & Rvalues.p) * 100
ep.loops <- as.integer((Lvalues.e & Rvalues.p) | (Lvalues.p & Rvalues.e)) * 1000
loop.types <- ctcf.loops + ee.loops + pp.loops + ep.loops
description <- rep("none", length(loop.types))
description[loop.types >= 1] <- "ctcf"
description[loop.types >= 10] <- "e-e"
description[loop.types >= 100] <- "p-p"
description[loop.types >= 1000] <- "e-p"
lto@rowData$loop.type <- description
return(lto)
})
#' @rdname annotateLoops
setMethod(f = "annotateLoops", signature = c("loops", "missing",
"GRanges", "GRanges"), definition = function(lto, ctcf, enhancer,
promoter) {
lto.df <- summary(lto)
Ranchors <- GRanges(lto.df$chr_1, IRanges(lto.df$start_1, lto.df$end_1))
Lanchors <- GRanges(lto.df$chr_2, IRanges(lto.df$start_2, lto.df$end_2))
#######
# Determine if right anchor is near promoter region
Rhits.p <- suppressWarnings(findOverlaps(promoter, Ranchors,
maxgap = 0))
Rvalues.p <- rep(FALSE, dim(lto.df)[1])
Rvalues.p[unique(subjectHits(Rhits.p))] <- TRUE
# Determine if left anchor is near promoter region
Lhits.p <- suppressWarnings(findOverlaps(promoter, Lanchors,
maxgap = 0))
Lvalues.p <- rep(FALSE, dim(lto.df)[1])
Lvalues.p[unique(subjectHits(Lhits.p))] <- TRUE
#######
# Determine if right anchor is near enhancer peak
Rhits.e <- suppressWarnings(findOverlaps(enhancer, Ranchors,
maxgap = 0))
Rvalues.e <- rep(FALSE, dim(lto.df)[1])
Rvalues.e[unique(subjectHits(Rhits.e))] <- TRUE
# Determine if left anchor is near enhancer peak
Lhits.e <- suppressWarnings(findOverlaps(enhancer, Lanchors,
maxgap = 0))
Lvalues.e <- rep(FALSE, dim(lto.df)[1])
Lvalues.e[unique(subjectHits(Lhits.e))] <- TRUE
#######
ee.loops <- as.integer(Lvalues.e & Rvalues.e) * 10
pp.loops <- as.integer(Lvalues.p & Rvalues.p) * 100
ep.loops <- as.integer((Lvalues.e & Rvalues.p) | (Lvalues.p & Rvalues.e)) * 1000
loop.types <- ee.loops + pp.loops + ep.loops
description <- rep("none", length(loop.types))
description[loop.types >= 10] <- "e-e"
description[loop.types >= 100] <- "p-p"
description[loop.types >= 1000] <- "e-p"
lto@rowData$loop.type <- description
return(lto)
})
#' Keep enhancer-promoter loops
#'
#' \code{keepEPloops} adds a column to the rowData slot of a loops
#' object that shows the corresponding TSS of a gene name based on
#' the promoter GRanges. The loops object is then subsetted and returns
#' only loops that are enhancer-promoter.
#'
#' This function works similar to the \code{annotateLoops} function but
#' returns only enhancer-promoter loops that are defined in this function.
#' Additionally, this function returns the gene name(s) of the nearby
#' transcription start sites in a comma-separted list if there are multiple.
#' These gene names are defined by the promoter GRanges mcol slot.
#'
#' @param lto A loops object whose loops will be annotated
#' @param enhancer GRanges object corresponding to locations of enhancer peaks
#' @param promoter GRanges object corresponding to locations of promoter regions
#'
#' @return A loops object with an additional row 'loop.type' in the rowData slot
#' in addition to the gene.tss (which has the gene name) and the
#' anchor.tss which shows the anchor(s) near the promoter region for the gene.
#'
#' @examples
#' rda<-paste(system.file('rda',package='diffloop'),'loops.small.rda',sep='/')
#' load(rda)
#' h3k27ac_j <- system.file('extdata','Jurkat_H3K27ac_chr1.narrowPeak',package='diffloop')
#' h3k27ac <- rmchr(padGRanges(bedToGRanges(h3k27ac_j), pad = 1000))
#' promoter <- padGRanges(getHumanTSS(c('1')), pad = 1000)
#' # small.ep <- keepEPloops(loops.small, h3k27ac, promoter)
#'
#' @import GenomicRanges
#' @importFrom stats aggregate
#' @export
setGeneric(name = "keepEPloops", def = function(lto,
enhancer, promoter) standardGeneric("keepEPloops"))
#' @rdname keepEPloops
setMethod(f = "keepEPloops", signature = c("loops",
"GRanges", "GRanges"), definition = function(lto, enhancer,
promoter) {
lto.df <- summary(lto)
Ranchors <- GRanges(lto.df$chr_1, IRanges(lto.df$start_1, lto.df$end_1))
Lanchors <- GRanges(lto.df$chr_2, IRanges(lto.df$start_2, lto.df$end_2))
# Determine if right anchor is near promoter region
Rhits.p <- suppressWarnings(findOverlaps(promoter, Ranchors,
maxgap = 0))
Rvalues.p <- rep(FALSE, dim(lto.df)[1])
Rvalues.p[unique(subjectHits(Rhits.p))] <- TRUE
# Determine if left anchor is near promoter region
Lhits.p <- suppressWarnings(findOverlaps(promoter, Lanchors,
maxgap = 0))
Lvalues.p <- rep(FALSE, dim(lto.df)[1])
Lvalues.p[unique(subjectHits(Lhits.p))] <- TRUE
# Aggregate TSS
Rtss <- data.frame(Rhits.p)
Rtss <- cbind(Rtss, mcols(promoter[Rtss$queryHits]))
Ltss <- data.frame(Lhits.p)
Ltss <- cbind(Ltss, mcols(promoter[Ltss$queryHits]))
ttss <- unique(rbind(Rtss, Ltss)[,c(2,3)])
tss <- aggregate(gene~subjectHits,paste,collapse=",",data=ttss)
#######
# Determine if right anchor is near enhancer peak
Rhits.e <- suppressWarnings(findOverlaps(enhancer, Ranchors, maxgap = 0))
Rvalues.e <- rep(FALSE, dim(lto.df)[1])
Rvalues.e[unique(subjectHits(Rhits.e))] <- TRUE
# Determine if left anchor is near enhancer peak
Lhits.e <- suppressWarnings(findOverlaps(enhancer, Lanchors,
maxgap = 0))
Lvalues.e <- rep(FALSE, dim(lto.df)[1])
Lvalues.e[unique(subjectHits(Lhits.e))] <- TRUE
#######
#Add annotation and subset
ep.loops <- (Lvalues.e & Rvalues.p) | (Lvalues.p & Rvalues.e)
gene.tss <- rep("none", dim(lto)[2])
anchor.tss <- rep("none", dim(lto)[2])
anchor.tss[which(Rvalues.p & Lvalues.p)] <- "1,2"
anchor.tss[which(Lvalues.p)] <- "1"
anchor.tss[which(Rvalues.p)] <- "2"
gene.tss[tss$subjectHits] <- tss$gene
lto@rowData$loop.type <- "e-p"
lto@rowData$gene.tss <- gene.tss
lto@rowData$anchor.tss <- anchor.tss
new.loops <- subsetLoops(lto, ep.loops)
return(new.loops)
})
#' Keep CTCF loops
#'
#' \code{keepCTCFloops} returns loops that are nearby CTCF peaks
#' as determined by some external data in a GRanges object
#'
#' This function works similar to the \code{annotateLoops} function but
#' returns only CTCF loops that are defined in this function.
#' However, loops in \code{annotateLoops} may have a different annotation
#' based on their priority scheme. For example, an e-p loop from
#' \code{annotateLoops} may be returned as a CTCF loop by this function
#' if the loop had both annotations. These peaks don't necessarily
#' need to be CTCF peaks, so using a GRanges object with enhancers
#' or promoters to determine e-e loops and p-p loops could also
#' be used in this function
#'
#' @param lto A loops object whose loops will be annotated
#' @param ctcf GRanges object corresponding to locations of CTCF peaks
#'
#' @return A loops object with all loops having both anchors in the GRanges region
#'
#' @examples
#' rda<-paste(system.file('rda',package='diffloop'),'loops.small.rda',sep='/')
#' load(rda)
#' ctcf_j <- system.file('extdata','Jurkat_CTCF_chr1.narrowPeak',package='diffloop')
#' ctcf <- rmchr(padGRanges(bedToGRanges(ctcf_j), pad = 1000))
#' # small.ctcf <- keepCTCFloops(loops.small, ctcf)
#'
#' @import GenomicRanges
#' @importFrom stats aggregate
#' @export
setGeneric(name = "keepCTCFloops", def = function(lto,
ctcf) standardGeneric("keepCTCFloops"))
#' @rdname keepCTCFloops
setMethod(f = "keepCTCFloops", signature = c("loops",
"GRanges"), definition = function(lto, ctcf) {
lto.df <- summary(lto)
Ranchors <- GRanges(lto.df$chr_1, IRanges(lto.df$start_1, lto.df$end_1))
Lanchors <- GRanges(lto.df$chr_2, IRanges(lto.df$start_2, lto.df$end_2))
# Determine if right anchor is near CTCF peak
Rhits.c <- suppressWarnings(findOverlaps(ctcf, Ranchors,
maxgap = 0))
Rvalues.c <- rep(FALSE, dim(lto.df)[1])
Rvalues.c[unique(subjectHits(Rhits.c))] <- TRUE
# Determine if left anchor is near CTCF peak
Lhits.c <- suppressWarnings(findOverlaps(ctcf, Lanchors,
maxgap = 0))
Lvalues.c <- rep(FALSE, dim(lto.df)[1])
Lvalues.c[unique(subjectHits(Lhits.c))] <- TRUE
ctcf.loops <- Lvalues.c & Rvalues.c
new.loops <- subsetLoops(lto, ctcf.loops)
return(new.loops)
})
#' Annotate enhancer-promoter loops with differential gene expression
#'
#' \code{annotateLoops.dge} adds columns to the rowData slot of a loops
#' object that shows summary statistics corresponding TSS of a gene name
#' based on the genes.tss rowData column. This function should be used
#' following the \code{keepEPloops} function.
#'
#' This function links enhancer-promoter loops and differential gene
#' expression results. The rownames of the deseq_res slots should correspond
#' to the gene names in the gene.tss column of the rowData slot of the loops
#' object. The function returns a loops object if multiple is specified as FALSE
#' which is the case by default. Otherwise, if multiple is TRUE, then this
#' function returns a data frame since each loop may have more than moe TSS.
#' One can reproduce this dataframe quickly when multiple = FALSE using the
#' summary() function on the returned loops object.
#'
#' @param lto A loops object whose loops will be annotated
#' @param deseq_res A data.frame
#' @param multiple Annotate loops with multiple TSS? Default = FALSE
#'
#' @return A loops object if multiple = FALSE or data frame if multiple = TRUE
#'
#' @examples
#' rda<-paste(system.file('rda',package='diffloop'),'loops.small.rda',sep='/')
#' load(rda)
#' h3k27ac_j <- system.file('extdata','Jurkat_H3K27ac_chr1.narrowPeak',package='diffloop')
#' h3k27ac <- rmchr(padGRanges(bedToGRanges(h3k27ac_j), pad = 1000))
#' promoter <- padGRanges(getHumanTSS(c('1')), pad = 1000)
#' #small.ep <- keepEPloops(loops.small, h3k27ac, promoter)
#' #ADD SOMETHING HERE.
#'
#' @import GenomicRanges
#' @importFrom data.table as.data.table
#' @export
setGeneric(name = "annotateLoops.dge", def = function(lto,
deseq_res, multiple = FALSE) standardGeneric("annotateLoops.dge"))
#' @rdname annotateLoops.dge
setMethod(f = "annotateLoops.dge", definition = function(lto, deseq_res, multiple = FALSE) {
if(!"gene.tss" %in% colnames(lto@rowData)){
stop("Must have gene.tss column in rowData slot!")
}
res.dt <- as.data.table(as.data.frame(deseq_res), keep.rownames=TRUE)
diffloop_results <- lto@rowData
diffloop_results <- dplyr::add_rownames(diffloop_results, "idx")
#E-P loops with one gene
dup <- grep(",", diffloop_results$gene.tss)
onehit <- diffloop_results[-dup,]
if(!multiple){
idx <- "" # hack to avoid note
onehit.dt <- as.data.table(onehit)
c.onehit <- merge(onehit.dt, res.dt, by.x = c("gene.tss"),by.y=c("rn"))
c.onehit <- dplyr::arrange(c.onehit, as.numeric(idx))
ida <- c.onehit$idx
gene.tss <- c.onehit$gene.tss
c.onehit <- within(c.onehit, rm("idx", "gene.tss"))
c.onehit$gene.tss <- gene.tss
new.loops <- subsetLoops(lto, as.numeric(ida))
new.loops@rowData <- c.onehit
return(new.loops)
} else {
diffloop_results <- summary(lto)
#E-P loops with one gene
dup <- grep(",", diffloop_results$gene.tss)
onehit <- diffloop_results[-dup,]
#Handle E-P loops with multiple genes
multhits <- diffloop_results[dup,]
multgenes <- strsplit(multhits$gene.tss, ",")
multhits.long <- data.frame()
for(i in 1:length(multgenes)){
t <- cbind(data.frame(multgenes[[i]]), do.call("rbind", replicate(length(multgenes[[i]]), multhits[i,], simplify = FALSE)))
t$gene.tss <- t[,1]
t <- t[,-1]
multhits.long <- rbind(multhits.long, t)
}
#Combine; make data.tables
allhits <- rbind(onehit, multhits.long)
allhits.dt <- as.data.table(allhits)
#Link DESeq results
c.allhits <- merge(allhits.dt, res.dt, by.x = c("gene.tss"),by.y=c("rn"))
gene.tss <- c.allhits$gene.tss
c.allhits <- within(c.allhits, rm("gene.tss"))
c.allhits$gene.tss <- gene.tss
return(data.frame(c.allhits))
}
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.