R/background.R
In PWMEnrich: PWM enrichment analysis

Documented in getBackgroundFrequencies getPromoters makeBackground makePriors makePWMCutoffBackground makePWMEmpiricalBackground makePWMGEVBackground makePWMLognBackground makePWMPvalCutoffBackground makePWMPvalCutoffBackgroundFromSeq makeStartEndPos motifEcdf .normalize.bg.seq

# Functions to create background distributions

#' check consistency of bg.seq input parameter
#'
#' @param bg.seq a set of background sequences, either a list of DNAString object or DNAStringSet object
.normalize.bg.seq = function(bg.seq){
	# check if the sequences are in the right format
	if(!inherits(bg.seq, "DNAStringSet")) {
		if(is.list(bg.seq)) {
			if(length(bg.seq) > 0 && ! inherits(bg.seq[[1]], "DNAString"))
				stop("bg.seq needs to be a list of DNAString objects or a DNAStringSet object")
		} else {
			stop("bg.seq needs to be a list of DNAString objects or a DNAStringSet object")
		}
	} #else {
		#bg.seq = DNAStringSetToList(bg.seq)
	#}
	
	return(bg.seq)
}

#' Make priors from background sequences
#'
#' These priors serve both as background nucleotide frequencies and pseudo-counts
#' for PWMs. 
#'
#' @param bg.seq a set of background sequences
#' @param bg.pseudo.count the total pseudocount shared between nucleotides
#' @export
#' @examples
#' # some example sequences
#' sequences = list(DNAString("AAAGAGAGTGACCGATGAC"), DNAString("ACGATGAGGATGAC"))
#' # make priors with pseudo-count of 1 shared between them
#' makePriors(sequences, 1)
makePriors = function(bg.seq, bg.pseudo.count){
	bg.seq = .normalize.bg.seq(bg.seq)

	# start with the count of the first sequence
	acgt.count = alphabetFrequency(bg.seq[[1]])[c("A", "C", "G", "T")]
	acgt.count = acgt.count[c("A", "C", "G", "T")] + acgt.count[c("T", "G", "C", "A")]
	# add ACGT counts for the rest
	if(length(bg.seq) > 1 ){
		for(i in 2:length(bg.seq)){
			cont = alphabetFrequency(bg.seq[[i]])[c("A", "C", "G", "T")]
			# add the counts on the other strand
			cont[c("A", "C", "G", "T")] = cont[c("A", "C", "G", "T")] + cont[c("T", "G", "C", "A")]
			# sum up
			acgt.count = acgt.count + cont
		}
	}
	
	# scale to bg.pseudo.count
	prior = (acgt.count / sum(acgt.count)) * bg.pseudo.count

	return(prior)	
}


#' Make a lognormal background distribution
#'
#' Construct a lognormal background distribution for a set of sequences. 
#' Sequences concatenated are binned in 'bg.len' chunks and lognormal distribution 
#' fitted to them. 
#'
#' @param bg.seq a set of background sequences, either a list of DNAString object or DNAStringSet object
#' @param motifs a set of motifs, either a list of frequency matrices, or a list of PWM objects. If
#'               frequency matrices are given, the background distribution is fitted from bg.seq. 
#' @param bg.pseudo.count the pseudo count which is shared between nucleotides when frequency matrices are given
#' @param bg.len background sequences will be split into tiles of this length (default: 250bp)
#' @param bg.len.sizes background tiles will be joined into bigger tiles containing this much smaller tiles. 
#'                     The default is \code{2^(0:4)}, which with \code{bg.len} translates into 
#'                     250bp, 500bp, 1000bp, 1500bp, 2000bp, 4000bp. Note this is only used in the "human" algorithm.
#' @param bg.source a free-form textual description of how the background was generated
#' @param verbose if to produce verbose output
#' @param algorithm type of algorithm to use, valid values are: "default" and "human".
#' @export
#' @examples
#' \dontrun{
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # make background for MotifDb motifs using 2kb promoters of all D. melanogaster transcripts 
#' 	  if(requireNamespace("BSgenome.Dmelanogaster.UCSC.dm3")) 
#'      makePWMLognBackground(Dmelanogaster$upstream2000, MotifDb.Dmel.PFM)
#' }
#' }
makePWMLognBackground = function(bg.seq, motifs, bg.pseudo.count=1, bg.len=250, bg.len.sizes=2^(0:4), bg.source="", verbose=TRUE, algorithm="default"){
	# check if the sequences are in the right format
	bg.seq = .normalize.bg.seq(bg.seq)

	# convert to list if a single motif is given
	if(!is.list(motifs))
		motifs = list(motifs)
		
	if(bg.len.sizes[1] != 1)
		stop("First value of 'bg.len.sizes' needs to be 1")
		
	if(!(algorithm %in% c("default", "human")))
		stop("Parameter 'algorithm' needs to be one of: ['default', 'human']")
		
	cat("NOTE: Using the '", 	algorithm, "' algorithm to infer background parameters,\n      ", sep="")
	if(algorithm == "default"){
		cat("appropriate for all organisms except human.\n")
	} else {
		cat("appropriate only for human data.\n")
	}

	# concatenate all the background sequences into a single long sequence
	bg.seq.all = concatenateSequences(bg.seq)
	# generate start and end positions
	bg.seq.start = seq(1, nchar(bg.seq.all)+1, bg.len)
	bg.seq.end = bg.seq.start - 1
	bg.seq.start = bg.seq.start[1:(length(bg.seq.start)-1)]
	bg.seq.end = bg.seq.end[2:length(bg.seq.end)]
	
	# split into bg.len sequences
	bg = DNAStringSet(bg.seq.all, start=bg.seq.start, end=bg.seq.end)
	
	if(length(bg)<1000)
		warnings(paste("The number of chunks of size", bg.len, "is smaller than 1000. This might lead to an non-robust estimate of lognormal distribution parameters."))
	
	# convert motifs to PWM format if neccessary
	if(!inherits(motifs[[1]], "PWM")) {
		prior = makePriors(list(DNAString(bg.seq.all)), bg.pseudo.count)
		pwms = PFMtoPWM(motifs, prior.params = prior)
	} else {
		pwms = motifs
	}
	
	# finally, do motif scanning and calculate mean and sd
	bg.res = motifScores(bg, pwms, verbose=verbose)
	
	# pwm lengths and base lengths
	pwm.len = sapply(pwms, length)
	bg.len.real = bg.len - pwm.len + 1
	
	# do a robust estimate of parameters

	# dnorm with left-censored data and known meanlog
	#
	# x value for which dnorm is needed
	# cen logical vector if the value is censored
	# meanlog meanlog parameter
	# sdlog sdlog parameter
	dlnorm.lcen = function(x, cen, meanlog, sdlog){
		if(sdlog <= 0)
			-Inf
		xn = x[!cen]
		xc = x[cen]

		sum(dlnorm(xn, meanlog, sdlog, log=TRUE)) + sum(plnorm(xc, meanlog, sdlog, lower.tail=TRUE, log.p=TRUE))
	}

	# Negative log-likelihood of left-censored data
	#
	# NOTE: this function assumes xc, cen and meanlog and in environment
	#
	# p set of parameters (in this case only sdlog)
	ll.lcen = function(p){
		-dlnorm.lcen(xc, cen, meanlog, p)
	}	
	
	if(algorithm == "default"){
		# the simple default algorithm
		bg.mean = colMeans(bg.res)
		bg.sd = apply(bg.res, 2, sd)
		
	} else {	
		# a matrix of inferred values that will be averaged over
		bg.mean.mat = bg.sd.mat = matrix(0, nrow=length(bg.len.sizes), ncol=ncol(bg.res))
		colnames(bg.mean.mat) = colnames(bg.sd.mat) = colnames(bg.res)
		if(verbose){
			cat("Parameter estimation...\n")
		}

		# do estimation for different sizes of background
		for(size.inx in 1:length(bg.len.sizes)){
			# current size multiplier
			cur.mul = bg.len.sizes[size.inx]
		
			if(verbose){
				cat("Recording distribution properties for", bg.len * cur.mul, "bp tiles (this may take a while...)\n") 
			}
		
			# summarise bg.res for the current size		
			max.len = nrow(bg.res) %/% cur.mul * cur.mul
			group = (0:(max.len-1)) %/% cur.mul
		
			# group by the groups and do an average
			if(cur.mul == 1){
				bg.res.size = bg.res
			} else {
				# new implementation for faster grouping!
				out = matrix(0, nrow=length(unique(group)), ncol=ncol(bg.res))
				for(i in 1:nrow(out)){
					row.inx = (i-1)*cur.mul+1
					r = bg.res[row.inx,]
		
					for(j in 1:(cur.mul-1)){
						r = r + bg.res[row.inx+j,]
					}
					out[i,] = r / cur.mul
				}
			
				bg.res.size = out
			
				# double-check just in case !
				#bg.res.size = by(bg.res[1:max.len,], group, colMeans)
				#bg.res.size = do.call("rbind", as.list(bg.res.size))
			}
		
			# consistency check
			stopifnot(nrow(bg.res.size) == max.len/cur.mul)
		
			# do robust estimate for each motif
			for(i in 1:ncol(bg.res)){
				if(verbose){
					cat("Estimating parameters for motif", i, "/", ncol(bg.res), "\n")
				}
				x = bg.res.size[,i]
		
				# censor everything below the 75% quantile
				b = quantile(x, 0.75)

				# left-censor data
				xc = x
				xc[x<=b] = b
				cen = rep(FALSE, length(x))
				cen[x<=b] = TRUE

				# meanlog based on median
				meanlog = median(log(x))

				# optimize sdlog
				p = optimize(ll.lcen, c(1e-8, 1e5))
		
				# transform and save
				ml = meanlog
				sl = p$minimum
				mx = exp(ml + (sl^2)/2)
				sx = sqrt((exp(sl^2)-1)*exp(2*ml+sl^2))
	
				# save
				bg.mean.mat[size.inx, i] = mx
				bg.sd.mat[size.inx, i] = sx
			}
		}
			
		# record the values
		bg.mean = bg.mean.mat
		bg.sd = bg.sd.mat
		
		# record the sizes
		bg.len.real = outer(bg.len.sizes, bg.len.real)
		rownames(bg.len.real) = NULL
	}
	
	new("PWMLognBackground", bg.source=bg.source, bg.len=bg.len.real, bg.mean=bg.mean, bg.sd=bg.sd, pwms=pwms)
}


#' Make a cutoff background
#'
#' Make a background based on number of motifs hits above a certain threshold. 
#'
#' @param bg.seq a set of background sequences, either a list of DNAString object or DNAStringSet object
#' @param motifs a set of motifs, either a list of frequency matrices, or a list of PWM objects. If
#'               frequency matrices are given, the background distribution is fitted from bg.seq. 
#' @param cutoff the cutoff at which the background should be made, i.e. at which a motif hit is called significant
#' @param bg.pseudo.count the pseudo count which is shared between nucleotides when frequency matrices are given
#' @param bg.source a free-form textual description of how the background was generated
#' @param verbose if to produce verbose output
#' @export
#' @examples
#' \dontrun{
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # make background for MotifDb motifs using 2Kb promoters of all D. melanogaster transcripts 
#'    # using a cutoff of 5
#' 	  if(requireNamespace("BSgenome.Dmelanogaster.UCSC.dm3")) 
#'      makePWMCutoffBackground(Dmelanogaster$upstream2000, MotifDb.Dmel.PFM, cutoff=log2(exp(5)))
#' }
#' }
makePWMCutoffBackground = function(bg.seq, motifs, cutoff=log2(exp(4)), bg.pseudo.count=1, bg.source="", verbose=TRUE){
	# check if the sequences are in the right format
	bg.seq = .normalize.bg.seq(bg.seq)
	
	# convert to list if a single motif is given
	if(!is.list(motifs))
		motifs = list(motifs)
	
	# make priors and PWMs
	if(!inherits(motifs[[1]], "PWM")) {
		prior = makePriors(bg.seq, bg.pseudo.count)
		pwms = PFMtoPWM(motifs, prior.params = prior)
	} else {
		pwms = motifs
	}
	
	# scan with cutoff - this gives motif *hit counts*
	bg.res = motifScores(bg.seq, pwms, verbose=verbose, cutoff=cutoff)
	
	seq.len = sapply(bg.seq, length)
	pwm.len = sapply(pwms, length)
	
	# total length of the scanned sequences for each sequence
	total.len = sum(seq.len) - length(seq.len)*(pwm.len-1)
			
	# total number of hits
	total.hits = colSums(bg.res)
	
	# density of binding sites
	bg.P = total.hits / total.len
	
	new("PWMCutoffBackground", bg.source=bg.source, bg.cutoff=cutoff, bg.P=bg.P, pwms=pwms)
}

#' Make an empirical P-value background
#'
#' Make a background appropriate for empirical P-value calculation. The provided set of background
#' sequences is contcatenated into a single long sequence which is then scanned with the motifs
#' and raw scores are saved. This object can be very large. 
#'
#' For reliable P-value calculation the size of the background set needs to be at least seq.len / min.P.value.
#' For instance, to get P-values at a resolution of 0.001 for a single sequence of 500bp, we would need 
#' a background of at least 500/0.001 = 50kb. This ensures that we can make 1000 independent 500bp samples from
#' this background to properly estimate the P-value. For a group of sequences, we would take seq.len to be the
#' total length of all sequences in a group. 
#'
#' @param bg.seq a set of background sequences, either a list of DNAString object or DNAStringSet object
#' @param motifs a set of motifs, either a list of frequency matrices, or a list of PWM objects. If
#'               frequency matrices are given, the background distribution is fitted from bg.seq. 
#' @param bg.pseudo.count the pseudo count which is shared between nucleotides when frequency matrices are given
#' @param bg.source a free-form textual description of how the background was generated
#' @param verbose if to produce verbose output
#' @param ... currently unused (this is for convenience for makeBackground function)
#' @export
#' @examples
#' \dontrun{
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # make empirical background by saving raw scores for each bp in the sequence. This can be 
#'    # very large in memory!
#' 	  if(requireNamespace("BSgenome.Dmelanogaster.UCSC.dm3")) 
#'      makePWMEmpiricalBackground(Dmelanogaster$upstream2000[1:100], MotifDb.Dmel.PFM)
#' }
#' }
makePWMEmpiricalBackground = function(bg.seq, motifs, bg.pseudo.count=1, bg.source="", verbose=TRUE, ...){
	# check if the sequences are in the right format
	bg.seq = .normalize.bg.seq(bg.seq)
	
	# convert to list if a single motif is given
	if(!is.list(motifs))
		motifs = list(motifs)
	
	# make priors and PWMs
	if(!inherits(motifs[[1]], "PWM")) {
		prior = makePriors(bg.seq, bg.pseudo.count)
		pwms = PFMtoPWM(motifs, prior.params = prior)
	} else {
		pwms = motifs
	}
	
	# scan one very long sequence that is formed by concatenating all sequences
	bg.seq.all = DNAString(concatenateSequences(bg.seq))
	
	bg.res = motifScores(bg.seq.all, pwms, raw.scores=TRUE, verbose=verbose)
	
	bg.len = length(bg.seq.all)
	bg.fwd = bg.res[[1]][1:bg.len,]
	bg.rev = bg.res[[1]][(bg.len+1):(bg.len*2),]
	
	if(length(motifs) == 1){
		bg.fwd = matrix(bg.fwd, ncol=1, dimnames=list(NULL, names(motifs)))
		bg.rev = matrix(bg.rev, ncol=1, dimnames=list(NULL, names(motifs)))
	}
	
	new("PWMEmpiricalBackground", bg.source=bg.source, bg.fwd=bg.fwd, bg.rev=bg.rev, pwms=pwms)
}

#' Construct a cutoff background from empirical background
#'
#' This function takes already calculated empirical background distribution and chooses
#' cutoff for each motif based on P-value cutoff for individual sites. 
#'
#' @param bg.p an object of class PWMEmpiricalBackground
#' @param p.value the P-value used to find cuttoffs for each of the motifs
#' @param bg.source textual description of background source
#' @return an object of type PWMCutoffBackground
#' @export
#' @examples
#' \dontrun{
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # make empirical background - here we use only 100 sequences for illustrative purposes
#' 	  if(requireNamespace("BSgenome.Dmelanogaster.UCSC.dm3")) 
#'      bg.p = makePWMEmpiricalBackground(Dmelanogaster$upstream2000[1:100], MotifDb.Dmel.PFM)
#'
#'    # use the empirical background to pick a threshold and make cutoff background
#'    makePWMPvalCutoffBackground(bg.p, 0.001)
#' }
#' }
makePWMPvalCutoffBackground = function(bg.p, p.value=1e-3, bg.source=""){
	# extract stuffs
	bg.fwd = bg.p@bg.fwd
	bg.rev = bg.p@bg.rev
	pwms = bg.p@pwms
	
	bg.len = nrow(bg.fwd)
	
	if(bg.len * p.value < 1){
		stop("The P-value is too small for background of this size. Should have at least 1/p.value nucleotides in the background.")
	}
	
	# get cutoffs
	cutoff = structure(rep(0, length(pwms)), names=names(pwms))
	P = structure(rep(0, length(pwms)), names=names(pwms))
	for(i in 1:length(pwms)){
		# join both strands
		all.data = log2(c(bg.fwd[!is.na(bg.fwd[,i]),i], bg.rev[!is.na(bg.rev[,i]),i]))
		
		# stop if not all data is finite, because then there is an error!
		stopifnot(length(is.finite(all.data)) == length(all.data))
		
		# find out the cutoff for 1-p.value
		cutoff[i] = quantile(ecdf(all.data), 1 - p.value)
		# work out the actual P value
		# NOTE: since the distribution is decrete, this won't be exactly the same as 2*P.value but will be quite close
		P[i] = 2 * (sum(all.data >= cutoff[i]) / length(all.data))
	}

	# final object
	new("PWMCutoffBackground", bg.source=bg.source, bg.cutoff=cutoff, bg.P=P, pwms=pwms)
}

#' Construct a P-value cutoff background from a set of sequences
#'
#' This function creates a P-value cutoff background for motif enrichment. 
#'
#' @param bg.seq a set of background sequences, either a list of DNAString object or DNAStringSet object
#' @param motifs a set of motifs, either a list of frequency matrices, or a list of PWM objects. If
#'               frequency matrices are given, the background distribution is fitted from bg.seq. 
#' @param p.value the P-value used to find cuttoffs for each of the motifs
#' @param bg.pseudo.count the pseudo count which is shared between nucleotides when frequency matrices are given
#' @param bg.source textual description of background source
#' @param verbose if to print verbose output
#' @return an object of type PWMCutoffBackground
#' @export
#' @examples
#' \dontrun{
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # use the empirical background to pick a threshold and make cutoff background
#'    makePWMPvalCutoffBackground(Dmelanogaster$upstream2000, 0.001)
#' }
#' }
makePWMPvalCutoffBackgroundFromSeq = function(bg.seq, motifs, p.value=1e-3, bg.pseudo.count=1, bg.source="", verbose=TRUE){
	# check if the sequences are in the right format
	bg.seq = .normalize.bg.seq(bg.seq)
	
	# convert to list if a single motif is given
	if(!is.list(motifs))
		motifs = list(motifs)
	
	# make priors and PWMs
	if(!inherits(motifs[[1]], "PWM")) {
		prior = makePriors(bg.seq, bg.pseudo.count)
		pwms = PFMtoPWM(motifs, prior.params = prior)
	} else {
		pwms = motifs
	}
	
	# pre-calculate the big sequence motifScoresBigMemory() is faster
	seq.input = .inputParamSequences(bg.seq)
	seq.all = DNAString(concatenateSequences(seq.input))
	
	# get cutoffs
	cutoff = structure(rep(0, length(pwms)), names=names(pwms))
	P = structure(rep(0, length(pwms)), names=names(pwms))
	for(i in 1:length(pwms)){	
		# get the raw values for this motif only!
		bg.res = motifScoresBigMemory(bg.seq, pwms[i], verbose=verbose, raw.scores=TRUE, seq.all=seq.all)
		bg.res = na.omit(unlist(bg.res))
		
		# transform to log2 to get the final data
		all.data = log2(bg.res)
		
		# find out the cutoff for 1-p.value
		cutoff[i] = quantile(ecdf(all.data), 1 - p.value)
		# work out the actual P value
		# NOTE: since the distribution is decrete, this won't be exactly the same as 2*P.value but will be quite close
		P[i] = 2 * (sum(all.data >= cutoff[i]) / length(all.data))
	}

	# final object
	new("PWMCutoffBackground", bg.source=bg.source, bg.cutoff=cutoff, bg.P=P, pwms=pwms)
}

#' Divide total.len into fragments of length len by providing start,end positions
#'
#' @param total.len total available length to be subdivided
#' @param len size of the individual chunk
#' @return a data.frame containing paired up start,end positions
makeStartEndPos = function(total.len, len){
	# make start positions
	start = seq(1, total.len+1, len)
	# make end positions
	end = start - 1
	# pair them up
	start = start[1:(length(start)-1)]
	end = end[2:length(end)]
	
	data.frame(start, end)
}

#' Make a GEV background distribution
#'
#' Construct a lognormal background distribution for a set of sequences. 
#' Sequences concatenated are binned in 'bg.len' chunks and lognormal distribution 
#' fitted to them. 
#'
#' @param bg.seq a set of background sequences, either a list of DNAString object or DNAStringSet object
#' @param motifs a set of motifs, either a list of frequency matrices, or a list of PWM objects. If
#'               frequency matrices are given, the background distribution is fitted from bg.seq. 
#' @param bg.pseudo.count the pseudo count which is shared between nucleotides when frequency matrices are given
#' @param bg.len the length range of background chunks
#' @param bg.source a free-form textual description of how the background was generated
#' @param verbose if to produce verbose output
#' @param fit.log if to fit log odds (instead of odds)
#' @export
#' @examples
#' \dontrun{
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # make background for MotifDb motifs using 2kb promoters of all D. melanogaster transcripts 
#' 	  if(requireNamespace("BSgenome.Dmelanogaster.UCSC.dm3")) 
#'      makePWMGEVBackground(Dmelanogaster$upstream2000, MotifDb.Dmel.PFM)
#' }
#' }
makePWMGEVBackground = function(bg.seq, motifs, bg.pseudo.count=1, bg.len=seq(200,2000,200), bg.source="", verbose=TRUE, fit.log=TRUE){
	# check if the sequences are in the right format
	bg.seq = .normalize.bg.seq(bg.seq)

	# convert to list if a single motif is given
	if(!is.list(motifs))
		motifs = list(motifs)
		
	# give automatic names
	if(is.null(names(motifs))){
		names(motifs) = paste("Motif", 1:length(motifs), sep="")
	}


	# concatenate all the background sequences into a single long sequence
	bg.seq.all = concatenateSequences(bg.seq)
	
	# convert motifs to PWM format if neccessary
	if(!inherits(motifs[[1]], "PWM")) {
		prior = makePriors(list(DNAString(bg.seq.all)), bg.pseudo.count)
		pwms = PFMtoPWM(motifs, prior.params = prior)
	} else {
		pwms = motifs
	}
		
	# GEV parameters that need to be fitted in linear regression
	params = array(0, dim=c(length(bg.len), 3, length(pwms)), 
		dimnames=list(bg.len, c("loc", "scale", "shape"), names(pwms)))
	
	# iterate over background lengths and fit GEV distributions
	for(i in 1:length(bg.len)){
		if(verbose){
			message("Scanning background of size ", bg.len[i])
		}
		bg.pos = makeStartEndPos(nchar(bg.seq.all), bg.len[i])
		bg = DNAStringSet(bg.seq.all, start=bg.pos$start, end=bg.pos$end)
		
		# scan
		bg.res = motifScores(bg, pwms, verbose=verbose)
		
		# fit GEV for each PWM
		params[i,,] = apply(bg.res, 2, function(x) {
			if(fit.log)
				x = log(x)
			fgev(x, std.err=FALSE)$param[c("loc", "scale", "shape")]
		})
	}
	
	## now fit linear regression for each PWM
	
	bg.loc = lapply(names(pwms), function(pwm.name){
		loc = params[,"loc",pwm.name]
		log.len = log(bg.len)
		lm(loc~log.len)
	})
	names(bg.loc) = names(pwms)
	
	bg.scale = lapply(names(pwms), function(pwm.name){
		scale = params[,"scale",pwm.name]
		log.len = log(bg.len)
		lm(scale~log.len)
	})
	names(bg.scale) = names(pwms)
	
	bg.shape = lapply(names(pwms), function(pwm.name){
		shape = params[,"shape",pwm.name]
		log.len = log(bg.len)
		lm(shape~log.len)
	})
	names(bg.shape) = names(pwms)
	
	# return the object	
	new("PWMGEVBackground", bg.source=bg.source, bg.loc=bg.loc, bg.scale=bg.scale, bg.shape=bg.shape, pwms=pwms)
}

#' Get the promoter sequences either for a named organism such as "dm3" or a BSgenome object
#' 
#' @param organismOrGenome either organism name, e.g. "dm3", or BSgenome object
#' @return a list of: promoters - DNAStringSet of (unique) promoters; organism - name of species; version - genome version
getPromoters = function(organismOrGenome){
	org = organismOrGenome
	sek = NULL
	org.valid = c("dm3", "mm9", "hg19")
	
	# check if it's a valid UCSC name
	if(is.character(org)){
		if(org %in% org.valid){
			sel = org
		} else {
			stop(paste("Unrecognised organism name, valid values are:", paste(org.valid, collapse=", ")))
		}
	# check for a BSgenome object	
	} else if(is(org, "BSgenome")){
	  genome <- metadata(org)[["genome"]]
		if(genome %in% org.valid){
			sel = genome 
		} else {
			stop(paste("Promoter sequences cannot be retrieved automatically for ", genome, ", please provide a set of background sequences explicitely.", sep=""))
		}
	} else {
		stop("The input parameter needs to be a valid genome name ('dm3', 'mm9' or 'hg19') or a set of background sequences.")
	}
	
	e = new.env()
	# get the promoter sequences from the saved object
	if(sel == "dm3"){		
		if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
			data("dm3.upstream2000", package = "PWMEnrich.Dmelanogaster.background", envir=e)
			promoters = e$dm3.upstream2000
			organism = "D. melanogaster"
			version = "dm3"
		} else {
			stop("This function requires the 'PWMEnrich.Dmelanogaster.background' package, please install it.")
		}
	} else if(sel == "mm9"){
		if(requireNamespace("PWMEnrich.Mmusculus.background")){
			data("mm9.upstream2000", package = "PWMEnrich.Mmusculus.background", envir=e)
			promoters = e$mm9.upstream2000
			organism = "M. musculus"
			version = "mm9"
		} else {
			stop("This function requires the 'PWMEnrich.Mmusculus.background' package, please install it.")
		}
	} else if(sel == "hg19"){
		if(requireNamespace("PWMEnrich.Hsapiens.background")){
			data("hg19.upstream2000", package = "PWMEnrich.Hsapiens.background", envir=e)
			promoters = e$hg19.upstream2000
			organism = "H. sapiens"
			version = "hg19"
		} else {
			stop("This function requires the 'PWMEnrich.Hsapiens.background' package, please install it.")
		}
	} else {
		stop("Internal error, should never reach this point!")
	}
	
	list(promoters=promoters, organism=organism, version=version)
}

#' Make a background for a set of position frequency matrices
#'
#' This is a convenience front-end function to compile new backgrounds for a set of PFMs. 
#' Currently only supports D. melanogaster, but in the future should support other common organisms as well. 
#'
#' @param motifs a list of position frequency matrices (4xL matrices)
#' @param organism either a name of the organisms for which the background should be compiled 
#'                 (currently supported names are "dm3", "mm9" and "hg19"), or a \code{BSgenome} object (see \code{BSgenome} package). 
#' @param type the type of background to be compiled. Possible types are: 
#'             \itemize{
#'                 \item "logn" - estimate a lognormal background
#'                 \item "cutoff" - estimate a Z-score background with fixed log-odds cutoff (in log2)
#'                 \item "pval" - estimate a Z-score background with a fixed P-value cutoff. Note that this may require a lot of memory
#'                                since the P-value of motif hits is first estimated from the empirical distribution. 
#'                 \item "empirical" - create an empirical P-value background. Note that this may require a lot of memory (up to 10GB in
#'                                     default "slow" mode (quick=FALSE) for 126 JASPAR motifs and 1000 D. melanogaster promoters).
#'                 \item "GEV" - estimate a generalized extreme value (GEV) distribution background by fitting linear regression to distribution 
#'                               parameters in log space
#'             }
#' @param quick if to preform fitting on a reduced set of 100 promoters. This will not give as good results but is much quicker than fitting to all the promoters (~10k). 
#'              Usage of this parameter is recommended only for testing and rough estimates. 
#' @param bg.seq a set of background sequences to use. This parameter overrides the "organism" and "quick" parameters.
#' @param ... other named parameters that backend function makePWM***Background functions take.
#' @export
#' @author Robert Stojnic, Diego Diez
#' @examples
#' 
#' # load in the two example de-novo motifs
#' motifs = readMotifs(system.file(package = "PWMEnrich", dir = "extdata", file = "example.transfac"), 
#'   remove.acc = TRUE)
#' 
#' \dontrun{
#'   # construct lognormal background
#'   bg.logn = makeBackground(motifs, organism="dm3", type="logn")
#'
#'   # alternatively, any BSgenome object can also be used
#'   if(requireNamespace("BSgenome.Dmelanogaster.UCSC.dm3"))
#'     bg.logn = makeBackground(motifs, organism=Dmelanogaster, type="logn")
#'
#'   # construct a Z-score of hits with P-value background
#'   bg.pval = makeBackground(motifs, organism="dm3", type="pval", p.value=1e-3)
#'
#'   # now we can use them to scan for enrichment in sequences (in this case there is a consensus 
#'   # Tin binding site).
#'   motifEnrichment(DNAString("TGCATCAAGTGTGTAGTG"), bg.logn)
#'   motifEnrichment(DNAString("TGCATCAAGTGTGTAGTG"), bg.pval)
#' }
#' 
makeBackground = function(motifs, organism="dm3", type="logn", quick=FALSE, bg.seq=NULL, ...){
	# check input parameters
	valid.types = c("logn", "cutoff", "pval", "empirical", "GEV")
	if(!(type %in% valid.types)){
		stop(paste("Invalid type, please choose from:", paste(valid.types, collapse=", ")))
	}

	# make sure we have this defined.... 
	if(!hasArg("bg.source"))
		bg.source = NULL
	else
		bg.source = list(...)$bg.source

	# use the explicitely set sequences
	if(!is.null(bg.seq)){
		bg.seq = .normalize.bg.seq(bg.seq)
	} else {
		# get the promoters for the organism
		promoters.all = getPromoters(organism)
		promoters = promoters.all$promoters
	
		if(quick){
			bg.seq = promoters[seq(1, length(promoters), length.out=100)]
			if(is.null(bg.source))
				bg.source = paste(promoters.all$organism, " (", promoters.all$version, ") 100 unique 2kb promoters", sep="")
		} else {
			if(type %in% c("pval")){
				bg.seq = promoters[seq(1, length(promoters), length.out=500)]
				if(is.null(bg.source))
					bg.source = paste(promoters.all$organism, " (", promoters.all$version, ") 500 unique 2kb promoters", sep="")
			} else {
				bg.seq = promoters
				if(is.null(bg.source))
					bg.source = paste(promoters.all$organism, " (", promoters.all$version, ") ", length(promoters), " unique 2kb promoters", sep="")
			}
		}
	}
	
	#bg.seq = DNAStringSetToList(bg.seq)
	
	# capture ... as parameters so we can remove bg.source if present
	# and set the other parameters
	params = list(...)	
	params$bg.seq = bg.seq
	params$motifs = motifs
	params$bg.source = bg.source
	
	# select the human algorithm
	if(type == "logn" && is.character(organism) && organism == "hg19" && !("algorithm" %in% names(params)))
		params$algorithm = "human"
	
	## now run the appropriate backend function
	if(type == "logn"){
		bg = do.call("makePWMLognBackground", params)
	} else if(type == "cutoff"){
		bg = do.call("makePWMCutoffBackground", params)
	} else if(type == "pval"){
		bg = do.call("makePWMPvalCutoffBackgroundFromSeq", params)
	} else if(type == "empirical"){
		bg = do.call("makePWMEmpiricalBackground", params)
	} else if(type == "GEV"){
		bg = do.call("makePWMGEVBackground", params)
	}
	
	return(bg)
}

#' Get the four nucleotides background frequencies
#'
#' Estimate the background frequencies of A,C,G,T on a set of promoters from an organism
#'
#' @param organism either a name of the organisms for which the background should be compiled 
#'                 (supported names are "dm3", "mm9" and "hg19"), a \code{BSgenome} object,
#'                  \code{DNAStringSet}, or list of \code{DNAString} objects
#' @param pseudo.count the number to which the frequencies sum up to, by default 1
#' @param quick if to preform fitting on a reduced set of 100 promoters. This will not give as good results but is much quicker than fitting to all the promoters (~10k). 
#'              Usage of this parameter is recommended only for testing and rough estimates.
#' @export
#' @author Robert Stojnic, Diego Diez
#' @examples
#' \dontrun{
#'   getBackgroundFrequencies("dm3")
#' }
getBackgroundFrequencies = function(organism="dm3", pseudo.count=1, quick=FALSE){
	# bug reported on support.bioconductor.org
	if (inherits(organism, "DNAStringSet")) {
		bg.seq = organism
	} else if (is.list(organism) && length(organism) > 0 && inherits(organism[[1]], "DNAString")) {
		bg.seq = organism
	} else {
		# pick the set of background sequences
		promoters = getPromoters(organism)$promoters
		
		if(quick){
			bg.seq = promoters[seq(1, length(promoters), length.out=100)]
		} else {
			bg.seq = promoters
		}
	}
	#bg.seq = DNAStringSetToList(bg.seq)
	
	makePriors(bg.seq, pseudo.count)
}

#' Calculate the empirical distribution score distribution for a set of motifs
#'
#' @param motifs a set of motifs, either a list of frequency matrices, or a list of PWM objects. If
#'               frequency matrices are given, the background distribution is fitted from bg.seq. 
#' @param organism either a name of the organisms for which the background should be compiled 
#'                 (supported names are "dm3", "mm9" and "hg19"), or a \code{BSgenome} object (see \code{BSgenome} package).
#' @param bg.seq a set of background sequence (either this or organism needs to be specified!). Can be a DNAString or DNAStringSet object.
#' @param quick if to do the fitting only on a small subset of the data (only in combination with \code{organism}). Useful only for code testing!
#' @param pseudo.count the pseudo count which is shared between nucleotides when frequency matrices are given
#' @return a list of \code{ecdf} objects (see help page for \code{ecdf} for usage). 
#' @export
motifEcdf = function(motifs, organism=NULL, bg.seq=NULL, quick=FALSE, pseudo.count=1){
	if(is.null(organism) && is.null(bg.seq)){
		stop("Either the 'organism' or 'bg.seq' parameter need to be specified!")
	}
	
	if(!is.list(motifs))
		motifs = list(motifs)
	
	if(!is.null(bg.seq)){
		# check if the sequences are in the right format
		bg.seq = .normalize.bg.seq(bg.seq)
	} else {
		# take only a single promoter from each of the genes
		promoters = getPromoters(organism)$promoters
		
		if(quick){
			bg.seq = promoters[seq(1, length(promoters), length.out=100)]
		} else {
			bg.seq = promoters
		}
	}
	
	# make priors and PWMs
	if(! inherits(motifs[[1]], "PWM")) {
		prior = makePriors(bg.seq, pseudo.count)
		pwms = PFMtoPWM(motifs, prior.params = prior)
	} else {
		pwms = motifs
	}	
	
	# always use the big memory implementation as it is much faster in typical usage!
	s = motifScoresBigMemory(bg.seq, pwms, raw.scores=TRUE)
	
	# group together distributions for motifs
	s = lapply(1:length(pwms), function(i) na.omit(unlist(lapply(s, function(x) x[,i]))))
	
	# create empirical CDFs and return
	e = lapply(s, ecdf)
	names(e) = names(pwms)
	
	e
}
Any scripts or data that you put into this service are public.
PWMEnrich documentation built on Nov. 8, 2020, 7:45 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PWMEnrich
PWM enrichment analysis

R/background.R
In PWMEnrich: PWM enrichment analysis

Defines functions motifEcdf getBackgroundFrequencies makeBackground getPromoters makePWMGEVBackground makeStartEndPos makePWMPvalCutoffBackgroundFromSeq makePWMPvalCutoffBackground makePWMEmpiricalBackground makePWMCutoffBackground makePWMLognBackground makePriors .normalize.bg.seq

Documented in getBackgroundFrequencies getPromoters makeBackground makePriors makePWMCutoffBackground makePWMEmpiricalBackground makePWMGEVBackground makePWMLognBackground makePWMPvalCutoffBackground makePWMPvalCutoffBackgroundFromSeq makeStartEndPos motifEcdf .normalize.bg.seq

Try the PWMEnrich package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PWMEnrich PWM enrichment analysis

R/background.R In PWMEnrich: PWM enrichment analysis

Defines functions motifEcdf getBackgroundFrequencies makeBackground getPromoters makePWMGEVBackground makeStartEndPos makePWMPvalCutoffBackgroundFromSeq makePWMPvalCutoffBackground makePWMEmpiricalBackground makePWMCutoffBackground makePWMLognBackground makePriors .normalize.bg.seq

Documented in getBackgroundFrequencies getPromoters makeBackground makePriors makePWMCutoffBackground makePWMEmpiricalBackground makePWMGEVBackground makePWMLognBackground makePWMPvalCutoffBackground makePWMPvalCutoffBackgroundFromSeq makeStartEndPos motifEcdf .normalize.bg.seq

Try the PWMEnrich package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PWMEnrich
PWM enrichment analysis

R/background.R
In PWMEnrich: PWM enrichment analysis