Nothing
#####################################################################
#####FUNCTIONS QC-metrics for narrow binding PROFILES ###########
#####################################################################
#' @keywords internal
## format conversion (spp; Koustav)
f_convertFormatBroadPeak <- function(given.clusters) {
chrl <- names(given.clusters)
names(chrl) <- chrl
chrl <- chrl[unlist(lapply(given.clusters, function(d) length(d$s))) > 0]
md <- do.call(rbind, lapply(chrl, function(chr) {
df <- given.clusters[[chr]]
data.frame(chr = chr, start = df$s, end = df$e, name = ".",
score = "0",
strand = ".", rv = df$rv, A1 = -1, A2 = -1)
}))
md <- md[order(as.numeric(md[, 7]), decreasing = TRUE), ]
md <- data.frame(md)
md.minimal <- md[, c(1, 2, 3)]
return(md.minimal)
}
#' @keywords internal
## format conversion (spp;Koustav)
f_converNarrowPeakFormat <- function(bd, margin = bd$whs) {
if (is.null(margin)) {
margin <- 50
}
chrl <- names(bd$npl)
names(chrl) <- chrl
md <- do.call(rbind, lapply(chrl, function(chr) {
df <- bd$npl[[chr]]
x <- df$x
rs <- df$rs
if (is.null(rs)) {
rs <- rep(NA, length(x))
}
re <- df$re
if (is.null(re)) {
re <- rep(NA, length(x))
}
ivi <- which(is.na(rs))
if (any(ivi)) {
rs[ivi] <- x[ivi] - margin
}
ivi <- which(is.na(re))
if (any(ivi)) {
re[ivi] <- x[ivi] + margin
}
data.frame(chr = chr, start = rs, end = re, name = ".",
score = "0", strand = ".",
y = df$y, A1 = -1,
fdr = format(df$fdr, scientific = TRUE, digits = 3),
dist = x - rs)
}))
md <- md[order(as.numeric(md[, 7]), decreasing = TRUE), ]
md <- data.frame(md)
md.minimal <- md[, c(1, 2, 3)]
return(md.minimal)
}
#' @keywords internal
## writing wig files
## writewig function of spp with different track type header
f_writewig <- function (dat, fname, feature, threshold = 5, zip = FALSE)
{
chrl <- names(dat)
names(chrl) <- chrl
invisible(lapply(chrl, function(chr) {
bdiff <- dat[[chr]]
ind <- seq(1, length(bdiff$x))
ind <- ind[!is.na(bdiff$y[ind])]
header <- chr == chrl[1]
f_write.probe.wig(chr, bdiff$x[ind], bdiff$y[ind], fname,
append = !header, feature = feature, header = header)
}))
if (zip) {
zf <- paste(fname, "zip", sep = ".")
system(paste("zip \"", zf, "\" \"", fname, "\"", sep = ""))
system(paste("rm \"", fname, "\"", sep = ""))
return(zf)
} else {
return(fname)
}
}
#' @keywords internal
## function used by writewig function
##Here I am using a slightly changed version of write.probe.wig from spp.
##This version changes the track type in the header to "bedGraph"
##ORIGINAL Version can be found in spp package!!!
f_write.probe.wig <- function(chr, pos, val, fname, append=FALSE, feature="M",
probe.length=35, header=TRUE)
{
min.dist <- min(diff(pos));
if(probe.length>=min.dist) {
probe.length <- min.dist-1;
cat("warning: adjusted down wig segment length to", probe.length,"\n");
}
mdat <- data.frame(chr, as.integer(pos), as.integer(pos+probe.length), val)
if(header) {
write(paste("track type=bedGraph name=\"Bed Format\" description=\"",
feature,"\" visibility=dense color=200,100,0 altColor=0,100,200
priority=20",sep=""),file=fname,append=append)
write.table(mdat, file=fname, col.names=FALSE, row.names=FALSE,
quote=FALSE, sep=" ", append=TRUE);
} else {
write.table(mdat, file=fname, col.names=FALSE, row.names=FALSE,
quote=FALSE, sep=" ", append=append);
}
}
#' @keywords internal
## reads bam file or tagalign file
f_readFile <- function(filename, reads.aligner.type) {
currentFormat <- get(paste("read", reads.aligner.type, "tags", sep = "."))
if (reads.aligner.type == "bam") {
data <- currentFormat(file.path(paste(filename, ".bam", sep = "")))
}
if (reads.aligner.type == "tagalign") {
data <- currentFormat(file.path(paste(filename,".tagAlign", sep = "")))
}
## readCount=sum(sapply(data$tags, length))
## readCount <- sum(unlist(lapply(data$tags, length)))
readCount <- sum(lengths(data$tags))
message(readCount," reads")
##double check data structure to make sure the structure contains
##two lists called $tags and $quality
helper=data
data=NULL
data$tags=helper$tags
data$quality=helper$quality
return(data)
}
#' @keywords internal
## filters canonical chromosomes
f_clearChromStructure <- function(structure, annotationID) {
##delete empty chromosomes
message("load chrom_info")
chInf <- f_chromInfoLoad(annotationID)
dCheck <- NULL
##check on regulare chromosomes
ss=structure
if (is.null(ss$tags)){
dCheck <- structure[ which( names(ss) %in% names(chInf) )]
#if (sum(sapply(dCheck, is.null ))>0){
if (sum ( unlist (lapply(dCheck, is.null )))>0){
#dCheck<- dCheck[ -which( sapply(dCheck, is.null ))]
dCheck<- dCheck[ -which( unlist (lapply(dCheck, is.null )))]
}
}else{
dCheck$tags <- ss$tags[ which( names(ss$tags) %in% names(chInf) )]
dCheck$quality <-
ss$quality[ which( names(ss$quality) %in% names(chInf) )]
#if (sum(sapply(dCheck$tags, is.null ))>0) {
if (sum (unlist (lapply (dCheck$tags, is.null )))>0) {
#dCheck$tags <-
#dCheck$tags[ -which( sapply(dCheck$tags, is.null ))]
dCheck$tags <-
dCheck$tags[ -which( unlist(lapply(dCheck$tags, is.null )))]
dCheck$quality <-
dCheck$quality[ -which(
unlist(lapply(dCheck$quality, is.null )))]
#dCheck$quality[ -which( sapply(dCheck$quality, is.null ))]
}
}
return(dCheck)
}
#' @keywords internal
## caluclate QC tag
f_qflag <- function(RSC) {
qflag <- NA
if ((RSC >= 0) & (RSC < 0.25)) {
qflag <- -2
} else if ((RSC >= 0.25) & (RSC < 0.5)) {
qflag <- -1
} else if ((RSC >= 0.5) & (RSC < 1)) {
qflag <- 0
} else if ((RSC >= 1) & (RSC < 1.5)) {
qflag <- 1
} else if ((RSC >= 1.5)) {
qflag <- 2
}
return(qflag)
}
#' @keywords internal
## selet informative tags
f_removeLocalTagAnomalies <- function(chip, input, chip_b.characteristics,
input_b.characteristics, remove.local.tag.anomalies,
select.informative.tags)
{
message("Filter tags")
if (select.informative.tags) {
message("select.informative.tags filter")
chipSelected <- select.informative.tags(chip,
chip_b.characteristics)
inputSelected <- select.informative.tags(input,
input_b.characteristics)
} else {
message("SKIP select.informative.tags filter")
chipSelected <- chip$tags
inputSelected <- input$tags
}
if (remove.local.tag.anomalies) {
message("remove.local.tag.anomalies filter")
## restrict or remove singular positions with very high tag counts
chipSelected <- remove.local.tag.anomalies(chipSelected)
inputSelected <- remove.local.tag.anomalies(inputSelected)
} else {
message("SKIP remove.local.tag.anomalies filter")
inputSelected <- input$tags
chipSelected <- chip$tags
}
finalList <- list(input.dataSelected = inputSelected,
chip.dataSelected = chipSelected)
return(finalList)
}
#' @keywords internal
## takes dataSelected as input, parallel is the number of CPUs used for
## parallelization
f_tagDensity <- function(data, tag.shift, chromDef, mc = 1) {
## Smoothing parameters
smoothingStep <- 20 ###is size of sw###before it was 10
smoothingBandwidth <- 50
## density distribution for data
message("Smooth tag density")
## tag smoothing, (sum of tags in all chr)/1e6
## ts <- sum(unlist(lapply(data, length)))/1e+06
ts <- sum(lengths(data)) / 1e6
### parallelisation
chromosomes_list <- names(data)
### creates a list of lists
data <- lapply(chromosomes_list, FUN = function(x) {
return(data[x])
})
#smoothed.density <- BiocParallel::bplapply(data,
# BPPARAM = BiocParallel::MulticoreParam(workers = mc),
smoothed.density<-mclapply(data,
mc.preschedule = FALSE,
mc.cores = mc,
FUN = function(current_chr_list) {
current_chr <- names(current_chr_list)
str(current_chr_list)
if (length(current_chr) != 1) {
stop("unexpected dataSelected structure")
}
spp::get.smoothed.tag.density(current_chr_list,
bandwidth = smoothingBandwidth,
step = smoothingStep, tag.shift = tag.shift,
rngl = chromDef[current_chr])
})
smoothed.density <- (unlist(smoothed.density, recursive = FALSE))
## normalizing smoothed tag density by library size
smoothed.density <- lapply(smoothed.density, function(d) {
d$y <- d$y/ts
return(d)
})
return(smoothed.density)
}
#' @keywords internal
# Author: Koustav Pal, Contact: koustav.pal@ifom.eu, Affiliation: IFOM - FIRC
# Institute of Molecular Oncology Why use this function and not the
# GenomicRanges
# function makeGRangesFromDataFrame? 1. This function creates names out of
# coordinates and does not inherit names from a data.frame as
# the default package
# function does. If it must be inherited, it can be specified with Names=Args.
# This provides better control on what the names are for each row and in some
# cases ensures that users are proactive when using the %in% construct. Cause
# the default behaviour is to construct new names, when users use an %in%
# construct they will encounter an error when older names are not recycled,
# forcing them to use the Names=Args parameter. 2. Using cbind > rbind >
# as.data.frame creates a scenario where first a matrix is created and then
# cast
# as a data.frame. In a matrix all points must be of the same type. Therefore,
# when a single row/col contains a character type value all values are cast as
# factors. makeGRangesFromDataFrame sticks to R principles and and produces no
# errors. Rather, as it expects to encounter a numeric type in start and end
# cols, all factors are cast back to numeric using the as.character >as.numeric
# construct. While this behaviour is helpful, it needlessly uses two additional
# steps to circumvent a probable bug. MakeGRangesObject does no such thing and
# will return an error as IRanges does not accept factor values.
f_makeGRangesObject <- function(Chrom = NULL, Start = NULL, End = NULL,
Strand = NULL, Names = NULL, Sep = ":")
{
# require(GenomicRanges)
if (is.null(Names)) {
Names <- paste(Chrom, as.integer(Start), as.integer(End), sep = Sep)
}
if (is.null(Strand)) {
Strand <- rep("*", length(Chrom))
}
Object <- GenomicRanges::GRanges(seqnames = Rle(Chrom),
ranges = IRanges(Start, end = End, names = Names),
strand = Rle(strand(Strand)))
return(Object)
}
# Author: Koustav Pal, Contact: koustav.pal@ifom.eu, Affiliation: IFOM - FIRC
# Institute of Molecular Oncology Creates non-overlapping regions from
#' @keywords internal
f_reduceOverlappingRegions <- function(ranges = NULL)
{
# Try it out.
rangesByChrom <- split(ranges,seqnames(ranges))
##loop over chromosomes
nonOverlappingRanges <- lapply(rangesByChrom,function(chromFrame){
myRange <- chromFrame[order(start(chromFrame))]
Chrom <- unique(as.vector(seqnames(myRange)))
#print(Chrom)
Q.name <- "Queries"
S.name <- "Subject"
##getting overlap object with the hits
HitsObject <- findOverlaps(query = myRange, subject = myRange)
PairList.hits <- HitsObject[
queryHits(HitsObject) != subjectHits(HitsObject)]
PairList <- data.frame(queryHits(PairList.hits),
subjectHits(PairList.hits))
colnames(PairList) <- c(Q.name,S.name)
#3number of non overlapping reagions
No.overlaps <- myRange[!(seq_along(myRange) %in% PairList[,Q.name])]
if (nrow(PairList) == 0) {
return(No.overlaps)
}
PairList.gt <- PairList[PairList[,S.name] > PairList[,Q.name],]
PairList.lt <- PairList[PairList[,S.name] < PairList[,Q.name],]
colnames(PairList.lt) <- c(S.name,Q.name)
PairList.lt <- PairList.lt[,c(Q.name,S.name)]
PairList <- rbind(PairList.lt,PairList.gt)
PairList <- PairList[order(PairList[,Q.name]),]
unique.queries <- unique(PairList[,Q.name])
unique.subjects <- unique(PairList[,S.name])
Which.q <- unique.queries[which(
!(unique.queries %in% unique.subjects))]
Which.s <- unique.subjects[which(
!(unique.subjects %in% unique.queries))]
#if(length(Which.q)!=length(Which.s)){
# stop("Cannot resolve overlaps. Contact the writer of the function
# to deconvolute the logic!\n")
#}
if(length(Which.q)!=length(Which.s)){
message("Skipping for overlapping analysis ",Chrom)
return(No.overlaps)
}else{
Starts <- start(myRange[Which.q])
Ends <- sapply(seq_along(Which.s),function(x){
Index <- Which.q[x]:Which.s[x]
max(end(myRange[Index]))
})
NewRanges <- f_makeGRangesObject(Chrom=rep(Chrom,length(Starts)),
Start= Starts, End= Ends)
return(c(No.overlaps,NewRanges))
}
})
nonOverlappingRangesFinal <- do.call(c,
unlist(nonOverlappingRanges,use.names = FALSE))
return(nonOverlappingRangesFinal)
}
###Orig function
#ReduceOverlappingRegions <- function(Ranges = NULL)
#{
# # Try it out.
# Ranges.split <- split(Ranges,seqnames(Ranges))
# Non.overlapping.ranges.list <- lapply(Ranges.split,function(Range){
# My.Range <- Range[order(start(Range))]
# Chrom <- unique(as.vector(seqnames(My.Range)))
# Q.name <- "Queries"
# S.name <- "Subject"
# HitsObject <- findOverlaps(query = My.Range, subject = My.Range)
# PairList.hits <-
# HitsObject[queryHits(HitsObject) != subjectHits(HitsObject)]
# PairList <- data.frame(queryHits(PairList.hits),
# subjectHits(PairList.hits))
# colnames(PairList) <- c(Q.name,S.name)
# No.overlaps <- My.Range[!(seq_along(My.Range)
# %in% PairList[,Q.name])]
# if (nrow(PairList) == 0) {
# return(No.overlaps)
# }
# PairList.gt <- PairList[PairList[,S.name] > PairList[,Q.name],]
# PairList.lt <- PairList[PairList[,S.name] < PairList[,Q.name],]
# colnames(PairList.lt) <- c(S.name,Q.name)
# PairList.lt <- PairList.lt[,c(Q.name,S.name)]
# PairList <- rbind(PairList.lt,PairList.gt)
# PairList <- PairList[order(PairList[,Q.name]),]
#
# unique.queries <- unique(PairList[,Q.name])
# unique.subjects <- unique(PairList[,S.name])
#
# Which.q <- unique.queries[
# which(!(unique.queries %in% unique.subjects))]
# Which.s <- unique.subjects[
# which(!(unique.subjects %in% unique.queries))]
#
# if(length(Which.q)!=length(Which.s)){
# stop("Cannot resolve overlaps. Contact the writer of the function
# to deconvolute the logic!\n")
# }
#
# Starts <- start(My.Range[Which.q])
# Ends <- sapply(seq_along(Which.s),function(x){
# Index <- Which.q[x]:Which.s[x]
# max(end(My.Range[Index]))
# })
#
# NewRanges <- MakeGRangesObject(Chrom=
# rep(Chrom,length(Starts)),Start= Starts, End= Ends)
# return(c(No.overlaps,NewRanges))
# })
# Non.overlapping.ranges <- do.call(c,
# unlist(Non.overlapping.ranges.list,use.names = FALSE))
# return(Non.overlapping.ranges)
#}
#####################################################################
######### FUNCTIONS QC-metrics for global read distribution #########
#' @keywords internal
f_shortenFrame <- function(smothDens) {
## shorten frame with cumulative distribution
new <- lapply(smothDens, function(chromelement) {
tt <- NULL
tt$x <- chromelement$x[seq.int(1, length(chromelement$x), 6)]
tt$y <- chromelement$y[seq.int(1, length(chromelement$y), 6)]
return(tt)
})
# for (i in chrl) { ##print(i)
###appending tags and quality elements of all
# inputs to newControl new[[i]]$x=smothDens[[i]]$x[seq.int(1,
# length(smothDens[[i]]$x), 6)] new[[i]]$y=smothDens[[i]]$y[seq.int(1,
# length(smothDens[[i]]$y), 6)] }
return(new)
}
#' @keywords internal
## sorts and bins the dataframe
f_sortAndBinning <- function(shortframe) {
shortframeSorted <- sort(shortframe)
BINS_NUMBER <- 10000
cumSum <- cumsum(shortframeSorted)
cumSumBins <- quantile(cumSum, probs = seq(0, 1, (1/BINS_NUMBER)))
pj <- (cumSumBins/cumSum[length(cumSum)])
normalizedCumSum <- data.frame(x = seq(0, 1, (1/BINS_NUMBER)), pj = pj)
rownames(normalizedCumSum) <- NULL
return(normalizedCumSum)
}
## The plot function creates the 'Fingerprint plot' i.e. the cumulative
## distribution of the read counts across genomic bins for Input and ChIP.
## cumChip
## The cumulative distribution of the read counts for the ChIP cumInput The
## cumulative distribution of the read counts for the Input savePlotPath
#' @keywords internal
f_fingerPrintPlot <- function(cumChip, cumInput, savePlotPath = NULL) {
if (!is.null(savePlotPath)) {
filename <- file.path(savePlotPath, "FingerPrintPlot.pdf")
pdf(file = filename, width = 7, height = 7)
}
plot(cumChip, type = "l", col = "blue", lwd = 2,
xlab = "Fraction of bins",
ylab = "Fr. of total reads coverage",
main = "Fingerprint: global read distribution")
lines(cumInput, col = "red", lwd = 2)
arrowx <- cumChip[which.max(abs(cumInput$pj - cumChip$pj)), ]$x
abline(v = arrowx, col = "green", lty = 2, lwd = 2)
## abline(h=schneidePunktY,col='cyan',lty=2,lwd=2)
## abline(v=schneidePunktX,col='cyan',lty=2,lwd=2)
legend("topleft", legend = c("Input", "ChIP"), fill = c("red", "blue"))
if (!is.null(savePlotPath)) {
dev.off()
}
}
#####################################################################
########## FUNCTIONS QC-metrics for local ENRICHMENT
############################################################
# BEGINF OF PETER Kharchenko's
# FUNCTIONS FOR BIN AVERAGES + SCALING OF METAGENES
############################################################
# - f_feature.bin.averages - f_two.point.scaling - f_one.point.scaling
#' @keywords internal
## calculates bin average values for a list of one- or two-point
## features dat -
## $x/$y data frame feat - data frame with either
## $x/{$strand} or $s/$e/{$strand}
## return.scaling causes function to just calculate, cleanup and return scaling
## mapping for further use
f_feature.bin.averages <- function(dat, feat, nu.feat.omit = FALSE,
nu.point.omit = TRUE, scaling = NULL, return.scaling = FALSE,
trim = 0, min.feature.size = NULL, m = 4020/2, ...)
{
if (is.null(feat)) {
return(NULL)
}
if (dim(feat)[1] < 1) {
return(NULL)
}
if (is.null(scaling)) {
## calculate scaling
if (!is.null(feat$e)) {
## two-point
scaling <- f_two.point.scaling(dat$x, feat, ...)
} else {
scaling <- f_one.point.scaling(dat$x, feat$x, feat$strand,
m = m, ...)
}
## clean up
if (nu.feat.omit) {
scaling <- scaling[!scaling$si %in%
unique(scaling$si[scaling$nu > 1]), ]
} else {
if (nu.point.omit) {
scaling <- scaling[scaling$nu == 1, ]
}
}
if (return.scaling) {
return(scaling)
}
}
if (!is.null(min.feature.size)) {
if (!is.null(feat$e)) {
iindex <- which(feat$e - feat$s >= min.feature.size)
scaling <- scaling[scaling$si %in% iindex, ]
}
}
## determine and return gene bin average table
return(tapply(dat$y[scaling$i],
list(factor(scaling$si, levels = c(1:dim(feat)[1])), scaling$bin),
function(x) mean(na.omit(x))))
## return(tapply(dat$y[scaling$i],list(scaling$si,scaling$bin),
## mean,trim=trim,na.rm=T))
}
#' @keywords internal
## identifies points located within margins of given segments and
## returns rescaled
## relative positions m - margin; om - outer margin;
## im - inner margin; rom - right outer margin, etc.
## x - x positions being mapped/rescaled
## seg - $s/$e/{$strand} data frame return $i/$rx/$nu data
## frame corresponding to indecies of original x values ($i)
## and new relative $rx positions, and whether
## a point maps to several segments $r5x/$r3x give distances relative to the 5'
## and 3' ends, $si gives segment index $bin index factor is added if nbins is
## supplied two.point.scaling <- function(x,seg,m=1e3,bs=gene_body,om=m,im=m,
## rom=om,lom=om, rim=im,lim=im,nbins=predefnum_bins/2) {
f_two.point.scaling <- function(x, seg, bs = 2000, im, rom, lom, nbins = 301)
{
rim <- im
lim <- im
## map points to the segments defined by outer margins
nseg <- length(seg$s)
if (!is.null(seg$strand)) {
nsi <- which(seg$strand == "-")
ml <- rep(lom, nseg)
ml[nsi] <- rom
mr <- rep(rom, nseg)
mr[nsi] <- lom
sg5 <- seg$s
sg5[nsi] <- seg$e[nsi]
sg3 <- seg$e
sg3[nsi] <- seg$s[nsi]
sstrand <- ifelse(seg$strand == "-", -1, 1)
sstrand[is.na(sstrand)] <- 1
} else {
ml <- rep(lom, nseg)
mr <- rep(rom, nseg)
sg5 <- seg$s
sg3 <- seg$e
sstrand <- rep(1, length(sg5))
}
spi <- spp::points_within(x, seg$s - ml, seg$e + mr,
return.list = TRUE)
#spi <- spp::points_withinFunction(x, seg$s - ml, seg$e + mr,
#return.list = TRUE)
lspi <- unlist(lapply(spi, length))
#df <- data.frame(i = rep(1:length(x), lspi), si = unlist(spi),
# nu = rep(lspi, lspi))
df <- data.frame(i = rep(seq_along(x), lspi), si = unlist(spi),
nu = rep(lspi, lspi))
df$r5x <- (x[df$i] - sg5[df$si]) * sstrand[df$si]
df$r3x <- (x[df$i] - sg3[df$si]) * sstrand[df$si]
## between g3-rim and g3+rom - unscaled
df$rx <- rep(NA, length(df$i))
vi <- which(df$r3x >= -rim)
df$rx[vi] <- df$r3x[vi] + lim + rim + bs
## df$rx <- df$r3x+lim+rim+bs; body scaling
vi <- which(df$r3x < -rim & df$r5x > lim)
df$rx[vi] <- ((df$r5x[vi] - lim)/
((seg$e - seg$s)[df$si[vi]] - lim - rim)) * bs + lim
## between g5-lom and g5+lim
vi <- which(df$r5x <= lim)
df$rx[vi] <- df$r5x[vi]
if (!is.null(nbins)) {
breaks <- seq(-lom, lim + bs + rim + rom,
by = (lom + lim + bs + rim + rom)/nbins)
rn <- breaks[-1] - diff(breaks)/2
breaks[1] <- breaks[1] - 1
breaks[length(breaks)] <- breaks[length(breaks)] + 1
df$bin <- cut(df$rx, breaks, labels = rn)
}
return(df)
}
#' @keywords internal
## much simpler one-point scaling that returns relative positions in the format
## similar to the two-point scaling ($i/$rx/$nu/$si) one.point.scaling <-
## function(x, pos, strand=NULL, m=1e3, lm=m, rm=m, nbins=predefnum_bins/2) {
## m=2010
## f_one.point.scaling <- function(x, pos, strand=NULL,m=up_downStream/2,
## lm=m, rm=m, nbins=predefnum_bins/2)
## { f_one.point.scaling <- function(x, pos,
## strand=NULL,m=4020/2, lm=m, rm=m, nbins=301/2) {
f_one.point.scaling <- function(x, pos, strand = NULL, nbins = 201, m = 4020/2)
{
lm <- m
rm <- m
if (is.null(pos)) {
return(NULL)
}
nseg <- length(pos)
if (nseg < 1) {
return(NULL)
}
ml <- rep(lm, nseg)
mr <- rep(rm, nseg)
if (!is.null(strand)) {
nsi <- which(strand == "-")
ml[nsi] <- rm
mr[nsi] <- lm
sstrand <- ifelse(strand == "-", -1, 1)
sstrand[is.na(sstrand)] <- 1
} else {
sstrand <- rep(1, length(pos))
}
spi <- spp::points_within(x, pos - ml, pos + mr,
return.list = TRUE)
#spi <- spp::points_withinFunction(x, pos - ml, pos + mr,
#return.list = TRUE)
lspi <- unlist(lapply(spi, length))
#df <- data.frame(i = rep(1:length(x), lspi), si = unlist(spi),
# nu = rep(lspi, lspi))
df <- data.frame(i = rep(seq_along(x), lspi), si = unlist(spi),
nu = rep(lspi, lspi))
df$rx <- (x[df$i] - pos[df$si]) * sstrand[df$si]
if (!is.null(nbins)) {
if (is.null(strand)) {
breaks <- seq(-lm, rm, by = (lm + rm)/nbins)
} else {
breaks <- seq(-max(lm, rm), max(lm, rm), by = (lm + rm)/nbins)
}
rn <- breaks[-1] - diff(breaks)/2
breaks[1] <- breaks[1] - 1
breaks[length(breaks)] <- breaks[length(breaks)] + 1
df$bin <- cut(df$rx, breaks, labels = rn)
}
return(df)
}
########################################################################
# END OF PETER K.'s FUNCTIONS FOR BIN AVERAGES + SCALING OF METAGENES ##
#' @keywords internal
## helper function for global settings
f_metaGeneDefinition <- function(selection = "Settings")
{
predefnum_bins <- 301 ## 151
## TWO point scaling
inner_margin <- 500 ## 500
right_outer_margin <- 1010 ## 1020
left_outer_margin <- 2010 ## 2020
gene_body <- 2000 ## 2000
totalGeneLength <- gene_body + 2 * inner_margin
inner_margin2 <- totalGeneLength - inner_margin
break_points_2P <- c(-2000, 0, inner_margin,
gene_body + inner_margin, totalGeneLength,
totalGeneLength + 1000)
total_output_gene_length <- sum(left_outer_margin,
inner_margin, gene_body, inner_margin,
right_outer_margin)
estimated_bin_size_2P <- total_output_gene_length/predefnum_bins
## ONE point scaling
up_downStream <- 4020
predefnum_bins_1P <- 201 ###binsize=20
break_points <- c(-2000, -1000, -500, 0, 500, 1000, 2000)
### %%% need to use this for one point scaling estimated bin size....
estimated_bin_size_1P <- up_downStream/predefnum_bins_1P
if (selection == "Settings") {
settings <- NULL
settings$break_points_2P <- break_points_2P
settings$estimated_bin_size_2P <- estimated_bin_size_2P
settings$break_points <- break_points
settings$estimated_bin_size_1P <- estimated_bin_size_1P
settings$predefnum_bins <- predefnum_bins
settings$inner_margin <- inner_margin
settings$right_outer_margin <- right_outer_margin
settings$left_outer_margin <- left_outer_margin
settings$gene_body <- gene_body
settings$up_downStream <- up_downStream
settings$predefnum_bins_1P <- predefnum_bins_1P
return(settings)
}
data(classesDefList, package = "ChIC.data", envir = environment())
if (selection == "Hlist") {
## GLOBAL VARIABLES
#Hlist <- c("H3K36me3", "POLR2A", "H3K4me3", "H3K79me2", "H4K20me1",
# "H2AFZ", "H3K27me3", "H3K9me3", "H3K27ac", "POLR2AphosphoS5",
# "H3K9ac", "H3K4me2", "H3K9me1", "H3K4me1", "POLR2AphosphoS2",
# "H3K79me1", "H3K4ac", "H3K14ac", "H2BK5ac", "H2BK120ac",
# "H2BK15ac", "H4K91ac", "H4K8ac", "H3K18ac", "H2BK12ac",
# "H3K56ac", "H3K23ac", "H2AK5ac", "H2BK20ac", "H4K5ac",
# "H4K12ac", "H2A.Z", "H3K23me2", "H2AK9ac", "H3T11ph")
##'POLR3G'
Hlist <- classesDefList$Hlist
return(Hlist)
}
if (selection == "TFlist") {
## GLOBAL VARIABLES
#Hlist <- c("H3K36me3", "POLR2A", "H3K4me3", "H3K79me2", "H4K20me1",
# "H2AFZ", "H3K27me3", "H3K9me3", "H3K27ac", "POLR2AphosphoS5",
# "H3K9ac", "H3K4me2", "H3K9me1", "H3K4me1", "POLR2AphosphoS2",
# "H3K79me1", "H3K4ac", "H3K14ac", "H2BK5ac", "H2BK120ac",
# "H2BK15ac", "H4K91ac", "H4K8ac", "H3K18ac", "H2BK12ac",
# "H3K56ac", "H3K23ac", "H2AK5ac", "H2BK20ac", "H4K5ac",
# "H4K12ac", "H2A.Z", "H3K23me2", "H2AK9ac", "H3T11ph")
##'POLR3G'
TFlist <- classesDefList$TF
return(TFlist)
}
if (selection == "Classes") {
## helper functions to define chromating
allSharp <- classesDefList$allSharp
# c("H3K27ac", "H3K9ac", "H3K14ac", "H2BK5ac", "H4K91ac",
# "H3K18ac", "H3K23ac", "H2AK9ac", "H3K4me3", "H3K4me2", "H3K79me1",
# "H2AFZ", "H2A.Z", "H4K12ac", "H4K8ac", "H3K4ac", "H2BK12ac",
# "H4K5ac", "H2BK20ac", "H2BK120ac", "H2AK5ac", "H2BK15ac")
allBroad <- classesDefList$allBroad
#c("H3K23me2", "H3K9me2", "H3K9me3", "H3K27me3", "H4K20me1",
# "H3K36me3", "H3K56ac", "H3K9me1", "H3K79me2", "H3K4me1", "H3T11ph")
## USE ONLY POLR2A for Pol2 class
RNAPol2 <- classesDefList$RNAPol2
final <- list(allSharp = allSharp,
allBroad = allBroad,
RNAPol2 = RNAPol2)
return(final)
}
}
#' @keywords internal
## helper function to check if annotationID is valid
f_annotationCheck <- function(annotationID)
{
checkMe <- ((annotationID == "hg19") |
(annotationID == "mm9")| (annotationID == "dm3")
| (annotationID == "mm10")| (annotationID == "hg38"))
if (is.character(annotationID) & checkMe)
{
message("\n",annotationID, " valid annotation...")
}else{
warning("annotationID not valid. Setting it back to default value
(hg19). Currently supported annotations are hg19,
hg38, mm9 and mm10.")
annotationID <- "hg19"
}
return(annotationID)
}
#' @keywords internal
## helper function to check if annotationID is valid
f_annotationLoad <- function(annotationID)
{
message("Load gene annotation")
## require(ChIC.data)
if (annotationID == "hg19") {
# hg19_refseq_genes_filtered_granges=NULL
data("hg19_refseq_genes_filtered_granges",
package = "ChIC.data", envir = environment())
annotObject <- hg19_refseq_genes_filtered_granges
}
if (annotationID == "hg38") {
# hg19_refseq_genes_filtered_granges=NULL
data("hg38_refseq_genes_filtered_granges",
package = "ChIC.data", envir = environment())
annotObject <- hg38_refseq_genes_filtered_granges
}
if (annotationID == "mm9") {
# hg19_refseq_genes_filtered_granges=NULL
data("mm9_refseq_genes_filtered_granges",
package = "ChIC.data", envir = environment())
annotObject <- mm9_refseq_genes_filtered_granges
}
if (annotationID == "mm10") {
# hg19_refseq_genes_filtered_granges=NULL
data("mm10_refseq_genes_filtered_granges",
package = "ChIC.data", envir = environment())
annotObject <- mm10_refseq_genes_filtered_granges
}
if (annotationID == "dm3") {
# hg19_refseq_genes_filtered_granges=NULL
data("dm3_refseq_genes_filtered_granges",
package = "ChIC.data", envir = environment())
annotObject <- dm3_refseq_genes_filtered_granges
}
return(annotObject)
}
#' @keywords internal
## helper function to check if annotationID is valid
f_chromInfoLoad <- function(annotationID)
{
## load chrom_info
##message("load chrom_info")
if (annotationID == "hg19") {
# hg19_chrom_info=NULL
data("hg19_chrom_info", package = "ChIC.data", envir = environment())
chromInfo <- hg19_chrom_info
}
if (annotationID == "hg38") {
# hg19_chrom_info=NULL
data("hg38_chrom_info", package = "ChIC.data", envir = environment())
chromInfo <- hg38_chrom_info
}
if (annotationID == "mm9") {
# hg19_chrom_info=NULL
data("mm9_chrom_info", package = "ChIC.data", envir = environment())
chromInfo <- mm9_chrom_info
}
if (annotationID == "mm10") {
# hg19_chrom_info=NULL
data("mm10_chrom_info", package = "ChIC.data", envir = environment())
chromInfo <- mm10_chrom_info
}
if (annotationID == "dm3") {
# hg19_chrom_info=NULL
data("dm3_chrom_info", package = "ChIC.data", envir = environment())
chromInfo <- dm3_chrom_info
}
return(chromInfo)
}
#' @keywords internal
##helper function to check if chromosome names contain "chr"
f_checkOfChrNames = function( data )
{
checkOfChr <- (grep( "chr", names( data$tags )))
if ( length(checkOfChr) < 1L)
{
newnames <- paste("chr", names(data$tags), sep="")
names(data$tags) <- newnames
names(data$quality) <- newnames
}
return(data)
}
#' @keywords internal
## masked_t.get.gene.av.density(smoothed.densityInput,
## gdl=annotatedGenesPerChr,mc=mc)
masked_t.get.gene.av.density <- function(chipTags_current, gdl, mc = 1)
{
settings <- NULL
settings <- f_metaGeneDefinition(selection = "Settings")
message("\nloading metaGene settings")
## print(str(settings))
result <- f_t.get.gene.av.density(chipTags_current,
gdl = gdl, im = settings$inner_margin,
lom = settings$left_outer_margin,
rom = settings$right_outer_margin, bs = settings$gene_body,
nbins = settings$predefnum_bins, mc = mc)
return(result)
}
## binnedInput_TSS <- masked_getGeneAvDensity_TES_TSS
## (smoothed.densityInput,gdl=annotatedGenesPerChr,mc=mc,tag='TSS')
#' @keywords internal
masked_getGeneAvDensity_TES_TSS <- function(smoothed.density, gdl,
tag = "TSS", mc = 1)
{
settings <- NULL
settings <- f_metaGeneDefinition(selection = "Settings")
message("\nloading metaGene settings")
## print(str(settings))
if (tag == "TSS") {
result <- f_t.get.gene.av.density_TSS(tl_current = smoothed.density,
gdl = gdl,
m = (settings$up_downStream / 2),
nbins = settings$predefnum_bins_1P,
separate.strands = FALSE,
mc = mc)
message("TSS")
} else {
result <- f_t.get.gene.av.density_TES(tl_current = smoothed.density,
gdl = gdl,
m = (settings$up_downStream / 2),
nbins = settings$predefnum_bins_1P,
separate.strands = FALSE,
mc = mc)
message("TES")
}
return(result)
}
#' @keywords internal
## MODIFIED VERSION OF
## t.get.gene.av.density for TWO.POINT a function for getting
## bin-averaged profiles for individual genes
## TWO.POINT t.get.gene.av.density <-
## function(chipTags_current,gdl=annotatedGenesPerChr, im=500,lom=5e3, rom=1e3,
## bs=2e3, nbins=400,separate.strands=F) {
## min.feature.size=min.feature.sizeMy=3000 f_t.get.gene.av.density <-
## function(chipTags_current, gdl=annotatedGenesPerChr, im=inner_margin,
## lom=left_outer_margin, rom=right_outer_margin, bs=gene_body,
## nbins=predefnum_bins,separate.strands=F, min.feature.size=3000,mc=1) {
## f_t.get.gene.av.density <- function(chipTags_current,gdl, im=500, lom=2010,
## rom=1010, bs=2000, nbins=301,separate.strands=F, min.feature.size=3000,mc=1)
f_t.get.gene.av.density <- function(chipTags_current, gdl, im, lom,
rom, bs, nbins, separate.strands = FALSE, min.feature.size = 3000, mc = 1)
{
chrl <- names(gdl)
names(chrl) <- chrl
## lapply(chrl[chrl %in% names(chipTags_current$td)],function(chr) {
## BiocParallel::bplapply(chrl[chrl %in% names(chipTags_current$td)],
## BPPARAM = BiocParallel::MulticoreParam(workers = mc),
mclapply(chrl[chrl %in% names(chipTags_current$td)],
mc.preschedule = FALSE,
mc.cores = mc,
FUN = function(chr) {
# print(chr)
nsi <- gdl[[chr]]$strand == "-"
# print(length(nsi))
current_gene_names <- gdl[[chr]]$geneSymbol
if ((sum(!nsi) > 0)) {
## if ((sum(!nsi)>1)) {
px <- f_feature.bin.averages(chipTags_current$td[[chr]],
data.frame(s = gdl[[chr]]$txStart[!nsi],
e = gdl[[chr]]$txEnd[!nsi]),
lom = lom, rom = rom, im = im, bs = bs,
nbins = nbins, min.feature.size = min.feature.size,
nu.point.omit = FALSE)
rownames(px) <- current_gene_names[!nsi]
} else {
px <- NULL
}
if ((sum(nsi) > 0)) {
## if ((sum(nsi)>1)) {
nd <- chipTags_current$td[[chr]]
nd$x <- -1 * nd$x
nx <- f_feature.bin.averages(nd,
data.frame(s = -1 * gdl[[chr]]$txEnd[nsi],
e = -1 * gdl[[chr]]$txStart[nsi]),
lom = lom, rom = rom, im = im,
bs = bs, nbins = nbins,
min.feature.size = min.feature.size,
nu.point.omit = FALSE)
rownames(nx) <- current_gene_names[nsi]
} else {
nx <- NULL
}
if (separate.strands) {
return(p = px, n = nx)
} else {
return(rbind(px, nx))
}
})
}
#' @keywords internal
## MODIFIED VERSION OF t.get.gene.av.density FOR TSS a function for getting
## bin-averaged profiles for individual genes TSS ONE.POINT
## f_t.get.gene.av.density_TSS <- function(tl_current,
## gdl=annotatedGenesPerChr,m=up_downStream, nbins=predefnum_bins_1P,
## separate.strands=F,mc=1) {###binning of the frame
f_t.get.gene.av.density_TSS <- function(tl_current, gdl, m = 4020,
nbins = 201, separate.strands = FALSE, mc = 1)
{
## binning of the frame
chrl <- names(gdl)
names(chrl) <- chrl
## lapply(chrl[chrl %in% names(tl_current$td)],function(chr)
## BiocParallel::bplapply(chrl[chrl %in% names(tl_current$td)],
## BPPARAM = BiocParallel::MulticoreParam(workers = mc),
mclapply(chrl[chrl %in% names(tl_current$td)],
mc.preschedule = FALSE,
mc.cores = mc,
FUN = function(chr) {
nsi <- gdl[[chr]]$strand == "-"
current_gene_names <- gdl[[chr]]$geneSymbol
## if ((sum(!nsi)>0)) {
if ((sum(!nsi) > 0)) {
## px <- f_feature.bin.averages(tl_current$td[[chr]],
## data.frame(x=gdl[[chr]]$txStart[!nsi]),m=m,
## nbins=nbins,nu.point.omit=FALSE)
px <- f_feature.bin.averages(tl_current$td[[chr]],
data.frame(x = gdl[[chr]]$txStart[!nsi]),
m = m,
min.feature.size=NULL,
nbins = nbins, nu.point.omit = FALSE)
rownames(px) <- current_gene_names[!nsi]
} else {
px <- NULL
}
if ((sum(nsi) > 0)) {
## if ((sum(nsi)>0)) {
nd <- tl_current$td[[chr]]
nd$x <- -1 * nd$x
## nx <- f_feature.bin.averages(nd,data.frame
## (x=-1*gdl[[chr]]$txEnd[nsi]),m=m,nbins=nbins,
## nu.point.omit=FALSE)
nx <- f_feature.bin.averages(nd,
data.frame(x = -1 * gdl[[chr]]$txEnd[nsi]),
m = m,
min.feature.size=NULL,
nbins = nbins, nu.point.omit = FALSE)
rownames(nx) <- current_gene_names[nsi]
} else {
nx <- NULL
}
if (separate.strands) {
return(p = px, n = nx)
} else {
return(rbind(px, nx))
}
})
}
#' @keywords internal
## MODIFIED VERSION OF t.get.gene.av.density FOR TES a function for getting
## bin-averaged profiles for individual genes TES ONE.POINT
## f_t.get.gene.av.density_TES <- function(tl_current,gdl=annotatedGenesPerChr,
## m=up_downStream, nbins=predefnum_bins_1P,separate.strands=F,mc=1) {
f_t.get.gene.av.density_TES <- function(tl_current, gdl, m = 4020,
nbins = 201, separate.strands = FALSE, mc = 1)
{
chrl <- names(gdl)
names(chrl) <- chrl
## lapply(chrl[chrl %in% names(tl_current$td)],function(chr)
## BiocParallel::bplapply(chrl[chrl %in% names(tl_current$td)],
## BPPARAM = BiocParallel::MulticoreParam(workers = mc),
mclapply(chrl[chrl %in% names(tl_current$td)],
mc.preschedule = FALSE,
mc.cores = mc,
FUN = function(chr) {
nsi <- gdl[[chr]]$strand == "-"
current_gene_names <- gdl[[chr]]$geneSymbol
## if ((sum(!nsi)>0)) {
if ((sum(!nsi) > 0)) {
## f_feature.bin.averages <-
##function(dat,feat,nu.feat.omit=F, nu.point.omit=T,
## scaling=NULL, return.scaling=F, trim=0,
## min.feature.size=NULL, ... ) {
px <- f_feature.bin.averages(tl_current$td[[chr]],
data.frame(x = gdl[[chr]]$txEnd[!nsi]),
m = m,
min.feature.size=NULL,
nbins = nbins, nu.point.omit = FALSE)
rownames(px) <- current_gene_names[!nsi]
} else {
px <- NULL
}
## if ((sum(nsi)>1)) {
if ((sum(nsi) > 0)) {
nd <- tl_current$td[[chr]]
nd$x <- -1 * nd$x
nx <- f_feature.bin.averages(nd,
data.frame(x = -1 * gdl[[chr]]$txStart[nsi]),
m = m,
min.feature.size=NULL,
nbins = nbins, nu.point.omit = FALSE)
rownames(nx) <- current_gene_names[nsi]
} else {
nx <- NULL
}
if (separate.strands) {
return(p = px, n = nx)
} else {
return(rbind(px, nx))
}
})
}
#' @keywords internal
f_spotfunction <- function(dframe, breaks, tag)
{
# hotSpots=NULL###takes values at different predefined points
fframe <- dframe[rownames(dframe) %in% breaks, ]
rownames(fframe) <- c(paste("hotSpots", tag, breaks, sep = "_"))
return(fframe)
}
#' @keywords internal
f_spotfunctionNorm <- function(dframe, breaks, tag)
{
newframe <- data.frame(dframe)
Norm <- newframe[rownames(newframe) %in% breaks, ]
fframe <- data.frame(breaks, Norm)
rownames(fframe) <- c(paste("hotSpots", tag, breaks, sep = "_"))
fframe$breaks <- NULL
return(fframe)
}
#' @keywords internal
f_maximaAucfunction <- function(dframe, breaks, estBinSize, tag)
{
## local maxima and area in all the predefined regions
## settings=f_metaGeneDefinition(selection='Settings')
## estimated_bin_size_2P=settings$estimated_bin_size_2P
dframe <- as.data.frame(dframe)
localMaxima_auc <- lapply(seq(length(breaks) - 1), FUN = function(i) {
xi <- breaks[i]
xj <- breaks[i + 1]
sub_set <- dframe[which((as.numeric(rownames(dframe)) <= xj) &
(as.numeric(rownames(dframe)) >= xi)), ]
l1 <- max(sub_set$Chip)
l2 <- max(sub_set$Input)
chip_frame <- cbind(i, "chip",
rownames(sub_set[which(sub_set$Chip == l1), ]), l1,
sum((sub_set$Chip) * estBinSize))
input_frame <- cbind("input",
rownames(sub_set[which(sub_set$Input == l2), ]), l2,
sum((sub_set$Input) * estBinSize))
tt <- (cbind(chip_frame, input_frame))
return(tt)
})
if (tag == "geneBody") {
fframe <- data.frame(matrix(unlist(localMaxima_auc),
nrow = 5, byrow = TRUE))
} else {
fframe <- data.frame(matrix(unlist(localMaxima_auc),
nrow = 6, byrow = TRUE))
}
colnames(fframe) <- c("i", "chip", "chipX", "chipY", "chipAUC",
"input", "inputX", "inputY", "inputAUC")
## save x-coordiante for max value for Chip and Input
xFrame <- data.frame(as.numeric(as.character(fframe$chipX)),
as.numeric(as.character(fframe$inputX)))
colnames(xFrame) <- c("Chip", "Input")
rownames(xFrame) <- paste("localMax", tag, fframe$i, "x", sep = "_")
## save y-coordiante for max value for Chip and Input
yFrame <- data.frame(as.numeric(as.character(fframe$chipY)),
as.numeric(as.character(fframe$inputY)))
colnames(yFrame) <- c("Chip", "Input")
rownames(yFrame) <- paste("localMax", tag, fframe$i, "y", sep = "_")
## save auc for Chip and Input
aucFrame <- data.frame(as.numeric(as.character(fframe$chipAUC)),
as.numeric(as.character(fframe$inputAUC)))
colnames(aucFrame) <- c("Chip", "Input")
rownames(aucFrame) <- paste("auc", tag, fframe$i, sep = "_")
finalReturn <- NULL
finalReturn <- data.frame(rbind(xFrame, yFrame, aucFrame))
return(finalReturn)
}
#' @keywords internal
f_maximaAucfunctionNorm <- function(dframe, breaks, estBinSize, tag)
{
newframe <- as.data.frame(dframe)
newframe$x <- rownames(newframe)
colnames(newframe) <- c("Norm", "break")
localMaxima_auc <- lapply(seq(length(breaks) - 1), FUN = function(i) {
# print(i)
xi <- breaks[i]
xj <- breaks[i + 1]
sub_set <- newframe[which((as.numeric(rownames(newframe)) <= xj) &
(as.numeric(rownames(newframe)) >= xi)), ]
l1 <- max(sub_set$Norm)
norm_frame <- cbind(i, "norm",
rownames(sub_set[which(sub_set$Norm == l1), ]),
l1, sum((sub_set$Norm) * estBinSize))
return(norm_frame)
})
if (tag == "geneBody") {
fframe <- data.frame(matrix(unlist(localMaxima_auc),
nrow = 5, byrow = TRUE))
} else {
fframe <- data.frame(matrix(unlist(localMaxima_auc),
nrow = 6, byrow = TRUE))
}
colnames(fframe) <- c("i", "norm", "X", "Y", "AUC")
## save x-coordiante for max value for Chip and Input
xFrame <- data.frame(as.numeric(as.character(fframe$X)))
colnames(xFrame) <- c("Norm")
rownames(xFrame) <- paste("localMax", tag, fframe$i, "x", sep = "_")
## save y-coordiante for max value for Chip and Input
yFrame <- data.frame(as.numeric(as.character(fframe$Y)))
colnames(yFrame) <- c("Norm")
rownames(yFrame) <- paste("localMax", tag, fframe$i, "y", sep = "_")
## save auc for Chip and Input
aucFrame <- data.frame(as.numeric(as.character(fframe$AUC)))
colnames(aucFrame) <- c("Norm")
rownames(aucFrame) <- paste("auc", tag, fframe$i, sep = "_")
finalReturn <- NULL
finalReturn <- data.frame(rbind(xFrame, yFrame, aucFrame))
return(finalReturn)
}
#' @keywords internal
## calculating variance, sd and qartiles of the value ditribution in different
## intervals
f_variabilityValues <- function(dframe, breaks, tag) {
#variabilityValues <- lapply(breaks[1:3], FUN = function(start) {
variabilityValues <- lapply(breaks[seq_len(3)], FUN = function(start) {
#print(start)
end <- abs(start)
sub_set <- dframe[which((as.numeric(rownames(dframe)) <= end) &
(as.numeric(rownames(dframe)) >= start)), ]
sub_set <- data.frame(sub_set)
name <- paste("dispersion", tag, start, "variance", sep = "_")
varI <- var(sub_set$Input)
varC <- var(sub_set$Chip)
name <- c(name, paste("dispersion", tag, start, "sd", sep = "_"))
sdI <- sd(sub_set$Input)
sdC <- sd(sub_set$Chip)
#valueFrameI <- data.frame(quantile(sub_set$Input)[1:4])
valueFrameI <- data.frame(quantile(sub_set$Input)[seq_len(4)])
colnames(valueFrameI) <- c("value")
#valueFrameC <- data.frame(quantile(sub_set$Chip)[1:4])
valueFrameC <- data.frame(quantile(sub_set$Chip)[seq_len(4)])
colnames(valueFrameC) <- c("value")
name <- c(name, paste("dispersion", tag, start,
rownames(valueFrameI), sep = "_"))
back <- data.frame(cbind(c(varI, sdI, valueFrameI$value),
c(varC, sdC, valueFrameC$value)))
colnames(back) <- c("Input", "Chip")
rownames(back) <- c(name)
return(back)
})
return(variabilityValues)
}
#' @keywords internal
# calculating variance, sd and qartiles of the value ditribution in different
# intervals
f_variabilityValuesNorm <- function(dframe, breaks, tag) {
newframe <- as.data.frame(dframe)
newframe$x <- rownames(newframe)
colnames(newframe) <- c("Norm", "break")
#variabilityValues <- lapply(breaks[1:3], FUN = function(start) {
variabilityValues <- lapply(breaks[seq_len(3)], FUN = function(start) {
#print(start)
end <- abs(start)
sub_set <- newframe[which((as.numeric(rownames(newframe)) <= end) &
(as.numeric(rownames(newframe)) >= start)), ]
sub_set <- data.frame(sub_set)
name <- paste("dispersion", tag, start, "variance", sep = "_")
var <- var(sub_set$Norm)
name <- c(name, paste("dispersion", tag, start, "sd", sep = "_"))
sd <- sd(sub_set$Norm)
#valueFrame <- data.frame(quantile(sub_set$Norm)[1:4])
valueFrame <- data.frame(quantile(sub_set$Norm)[seq_len(4)])
colnames(valueFrame) <- c("value")
name <- c(name, paste("dispersion", tag, start,
rownames(valueFrame), sep = "_"))
back <- data.frame(cbind(c(var, sd, valueFrame$value)))
colnames(back) <- c("Norm")
rownames(back) <- c(name)
return(back)
})
return(variabilityValues)
}
#####################################################################
######################### FUNCTIONS comparison with compendium
#####################################################################
#' @keywords internal
## helper function to load profiles from ChIC.data
f_loadDataCompendium <- function(endung, target, tag)
{
# compendium_profiles=ChIC.data::compendium_profiles
if (tag == "geneBody") {
name <- paste(target, "_", "TWO", endung, sep = "")
} else {
name <- paste(target, "_", tag, endung, sep = "")
}
#load profiles
if (target %in% f_metaGeneDefinition("Hlist")){
data("compendium_profiles",
package = "ChIC.data",
envir = environment())
frame=compendium_profiles[[name]]
}else{
data("compendium_profiles_TF",
package = "ChIC.data",
envir = environment())
frame=compendium_profiles_TF[[name]]
}
return(frame)
}
#' @keywords internal
## helper function to prepare dataframe
f_prepareData <- function(fmean, frame)
{
finalframe <- cbind(fmean$x, frame)
rownames(finalframe) <- NULL
finalframe <- as.data.frame(finalframe)
colnames(finalframe) <- c("x", "mean")
return(finalframe)
}
#' @keywords internal
## plot profiles compendium versus current dataset
f_plotProfiles <- function(meanFrame, currentFrame, endung = "geneBody",
absoluteMinMax, maintitel = "title", ylab = "mean of log2 read density",
savePlotPath = NULL)
{
message("Load settings")
settings <- f_metaGeneDefinition(selection = "Settings")
if (!is.null(savePlotPath)) {
filename <- paste(maintitel, ".pdf", sep = "")
pdf(file = file.path(savePlotPath, filename), width = 10, height = 7)
}
break_points_2P <- settings$break_points_2P
break_points <- settings$break_points
## The standard error of the mean (SEM) is the standard deviation of the
## sample-mean's estimate of a population mean.
## (It can also be seen as the standard deviation of the error in the
## sample mean with respect to the true
## mean, since the sample mean is an unbiased estimator.) SEM is usually
## estimated by the sample estimate of the population
## standard deviation (sample standard deviation) divided by the
## square root of the sample size (assuming
## statistical independence of the values in the sample)
plot(x = c(min(meanFrame$x), max(meanFrame$x)),
y = c(absoluteMinMax[1], absoluteMinMax[2]),
type = "n", xlab = "metagene coordinates", ylab = ylab,
main = maintitel,
xaxt = "n")
polygon(x = c(meanFrame$x, rev(meanFrame$x)),
y = c(meanFrame$mean + 2 * meanFrame$sderr,
rev(meanFrame$mean)), col = "lightblue", border = NA)
polygon(x = c(meanFrame$x, rev(meanFrame$x)),
y = c(meanFrame$mean - 2 * meanFrame$sderr,
rev(meanFrame$mean)), col = "lightblue", border = NA)
lines(x = meanFrame$x, y = meanFrame$mean, col = "black", lwd = 2)
lines(x = currentFrame$x, y = currentFrame$mean, col = "red", lwd = 2)
if (endung == "geneBody") {
currBreak_points <- break_points_2P[c(-2, -5)]
##c(-2000,500,2500,4000)
abline(v = c(break_points_2P[c(2, 5)]), lty = 2,
col = "darkgrey", lwd = 3)
abline(v = currBreak_points, lty = 3,
col = "darkgrey", lwd = 2)
axis(side = 1, at = break_points_2P,
labels = c("-2KB", "TSS", "TSS+500", "TES-500", "TES", "+1KB"))
} else {
TSSbreak_points <- break_points[-c(4)]
# c(-2000,-1000,-500,500,1000,2000)
if (endung == "TSS") {
axis(side = 1, at = break_points,
labels=c("-2KB","-1KB", "-500", "TSS", "+500", "+1KB", "+2KB"))
} else {
axis(side = 1, at = break_points,
labels=c("-2KB","-1KB", "-500", "TES", "+500", "+1KB", "+2KB"))
}
abline(v = 0, lty = 2, col = "darkgrey", lwd = 2)
abline(v = TSSbreak_points, lty = 3, col = "darkgrey", lwd = 2)
}
legend("topleft", legend = c("mean", "2*stdErr"),
fill = c("black", "lightblue"),
bg = "white")
if (!is.null(savePlotPath)) {
dev.off()
}
}
#' @keywords internal
## helper function to get binding class
f_getBindingClass <- function(target) {
allChrom <- f_metaGeneDefinition("Classes")
if (target %in% allChrom$allSharp) {
profileSet <- allChrom$allSharp
tag <- "(Sharp class)"
}
if (target %in% allChrom$allBroad) {
profileSet <- allChrom$allBroad
tag <- "(Broad class)"
}
if (target %in% allChrom$RNAPol2) {
profileSet <- allChrom$RNAPol2
tag <- "(RNAPol2 class)"
}
return(list(profileSet = profileSet, tag = tag))
}
#' @keywords internal
## density plot for QC-value distribution versus a single value
f_plotValueDistribution <- function(compendium, title, coordinateLine,
savePlotPath = NULL)
{
if (!is.null(savePlotPath)) {
filename <- file.path(savePlotPath, "PlotValueDistribution.pdf")
pdf(file = filename, width = 10, height = 7)
}
mycolors <- c("lightsteelblue1", "lightsteelblue3")
maxi <- max(compendium)
mini <- min(compendium)
## get density
d <- density(compendium)
plot(d, main = title, xlim = c(mini, maxi))
median <- median(compendium)
qq <- quantile(compendium, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
i <- d$x >= (qq[1]) & d$x <= (qq[2])
lines(d$x, d$y)
polygon(c(qq[1], d$x[i], qq[2]), c(0, d$y[i], 0), col = mycolors[1])
i <- d$x >= (qq[2]) & d$x <= (qq[3])
lines(d$x, d$y)
polygon(c(qq[2], d$x[i], qq[3]), c(0, d$y[i], 0), col = mycolors[2])
i <- d$x >= (qq[3]) & d$x <= (qq[4])
lines(d$x, d$y)
polygon(c(qq[3], d$x[i], qq[4]), c(0, d$y[i], 0), col = mycolors[2])
i <- d$x >= (qq[4]) & d$x <= (qq[5])
lines(d$x, d$y)
polygon(c(qq[4], d$x[i], qq[5]), c(0, d$y[i], 0), col = mycolors[1])
abline(v = coordinateLine, , col = "red", lwd = 3, lty = 2)
if (!is.null(savePlotPath)) {
dev.off()
}
}
#' @keywords internal
## helper function to select the random forest model for the respective
## chromatinmark or TF
f_getPredictionModel <- function(id) {
# library(randomForest)
allChrom <- f_metaGeneDefinition("Classes")
data("rf_models", package = "ChIC.data", envir = environment())
if (id %in% f_metaGeneDefinition("Hlist")) {
message("Load chromatinmark model")
if (id %in% allChrom$allSharp) {
model <- rf_models[["sharpEncode"]]
}
if (id %in% allChrom$allBroad) {
model <- rf_models[["broadEncode"]]
}
if (id %in% allChrom$RNAPol2) {
model <- rf_models[["RNAPol2Encode"]]
}
if (id == "H3K9me3") {
model <- rf_models[["H3K9Encode"]]
}
if (id == "H3K27me3") {
model <- rf_models[["H3K27Encode"]]
}
if (id == "H3K36me3") {
model <- rf_models[["H3K36Encode"]]
}
} else if ((id %in% f_metaGeneDefinition("TFlist")) | (id== "TF"))
{
message("Load TF model")
model <- rf_models$TFmodel
} else {
message(id, "not found")
model=NULL
}
return(model)
}
#' @keywords internal
## helper function that converts frame with chip and normalized values to one
## column frame to further be procecced.
f_convertframe <- function(oldframe) {
values <- c(oldframe$Chip, oldframe$Norm)
newframe <- data.frame(values)
nn <- c(paste("chip", rownames(oldframe), sep = "_"),
paste("norm", rownames(oldframe),
sep = "_"))
rownames(newframe) <- nn
return(newframe)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.