R/pwm.R
In PWMEnrich: PWM enrichment analysis

Documented in affinitySequenceSet cutoffZscore cutoffZscoreSequenceSet empiricalPvalue empiricalPvalueSequenceSet gevPerSequence .inputParamMotifs .inputParamSequences keepFinite logNormPval logNormPvalSequenceSet matrixShuffleZscorePerSequence motifEnrichment motifIC motifPrAUC motifRecoveryAUC motifScores motifScoresBigMemory .normargPfm .normargPriorParams PFMtoPWM PWMUnscaled scanWithPWM toPWM

# Globals from Biostrings
DNA_BASES = c("A", "C", "G", "T")
DNA_ALPHABET = c("A", "C", "G", "T", "M", "R", "W", "S", "Y", "K", "V", "H", "D", "B", "N", "-", "+")

#' Input parameter normalization function for PWMUnscaled
#'
#' This function is from Biostrings package
#'
#' @param prior.params Typical 'prior.params' vector: c(A=0.25, C=0.25, G=0.25, T=0.25)
.normargPriorParams <- function(prior.params)
{
    if (!is.numeric(prior.params))
        stop("'prior.params' must be a numeric vector")
    if (length(prior.params) != length(DNA_BASES) ||
        !setequal(names(prior.params), DNA_BASES))
        stop("'prior.params' elements must be named A, C, G and T")
    ## Re-order the elements.
    prior.params <- prior.params[DNA_BASES]
    if (any(is.na(prior.params)) || any(prior.params < 0))
        stop("'prior.params' contains NAs and/or negative values")
    prior.params
}

#' Input parameter normalization for PWMUnscaled
#'
#' This function is from Biostrings package.
#' A Position Frequency Matrix (PFM) is also represented as an ordinary
#' matrix. Unlike a PWM, it must be of type integer (it will typically be
#' the result of consensusMatrix()).
#'
#' @param x a frequency matrix
.normargPfm <- function(x)
{
    if (!is.matrix(x) || !is.integer(x))
        stop("invalid PFM 'x': not an integer matrix")
    ## Check the row names.
    if (is.null(rownames(x)))
        stop("invalid PFM 'x': no row names")
    if (!all(rownames(x) %in% DNA_ALPHABET))
        stop("invalid PFM 'x': row names must be in 'DNA_ALPHABET'")
    if (!all(DNA_BASES %in% rownames(x)))
        stop("invalid PFM 'x': row names must contain A, C, G and T")
    if (any(duplicated(rownames(x))))
        stop("invalid PFM 'x': duplicated row names")
    ## Check the nb of cols.
    if (ncol(x) == 0L)
        stop("invalid PFM 'x': no columns")
    ## Check the values.
    if (any(is.na(x)) || any(x < 0L))
        stop("invalid PFM 'x': values cannot be NA or negative")
    if (any(x[!(rownames(x) %in% DNA_BASES), ] != 0L))
        stop("invalid PFM 'x': IUPAC ambiguity letters are represented")
    x <- x[DNA_BASES, , drop=FALSE]
    x
}

#' The PWM function from Biostrings without unit scaling
#'
#' By default the Biostrings package scales the log-odds score so it is within 0 and 1. In this function
#' we take a more traditional approach with no unit scaling and offer unit scaling as an additional parameter.
#'
#' See ?PWM from Biostrings for more information on input arguments. 
#'
#' @title Create a PWM from PFM
#' @param x the integer count matrix representing the motif, rows as nucleotides
#' @param id a systematic ID given to this PWM, could include the source, version, etc
#' @param name the name of the transcription factor (TF) to which the PWM corresponds to
#' @param type the type of PWM calculation, either as log2-odds, or posterior probability (frequency matrix)
#' @param prior.params the pseudocounts for each of the nucleotides 
#' @param pseudo.count the pseudo-count values if different from priors
#' @param unit.scale if to unit.scale the pwm (default is no unit scaling)
#' @param seq.count if x is a normalised PFM (i.e. with probabilities instead of sequence counts), then this sequence count
#'                  will be used to convert \code{x} into a count matrix
#' @return a new PWM object representing the PWM 
#' @export
#' @examples
#'
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    ttk = MotifDb.Dmel.PFM[["ttk"]]
#'    
#'    # make a PWM with uniform background
#'    PWMUnscaled(ttk, id="ttk-JASPAR", name="ttk")
#'    
#'    # custom background
#'    PWMUnscaled(ttk, id="ttk-JASPAR", name="ttk", 
#'      prior.params=c("A"= 0.2, "C" = 0.3, "G" = 0.3, "T" = 0.2))
#'
#'    # get background for drosophila (quick mode on a reduced dataset)
#'    prior = getBackgroundFrequencies("dm3", quick=TRUE)
#'    
#'    # convert using genomic background
#'    PWMUnscaled(ttk, id="ttk-JASPAR", name="ttk", prior.params=prior)
#' }
#'
PWMUnscaled = function(x, id="", name="", type=c("log2probratio", "prob"), prior.params=c(A=0.25, C=0.25, G=0.25, T=0.25), pseudo.count=prior.params, 
	unit.scale=FALSE, seq.count=NULL){
	
	# convert to PFM if needed
	if(!is.null(seq.count)){
		x = apply(round(x * seq.count), 1:2, as.integer)
	}
	
	# match input params
    x <- .normargPfm(x)
    nseq <- colSums(x)
    type <- match.arg(type)
    prior.params <- .normargPriorParams(prior.params)
    
    # total prior added
    priorN <- sum(prior.params)
    
    # posterior probabilities with unequal number of counts per row
    postProbs <- divideRows(x + prior.params, nseq + priorN)
    rownames(postProbs) = rownames(x)
    colnames(postProbs) = colnames(x)
    
    # calculate logodds, or return probabilities
    if (type == "log2probratio") {
        if (any(prior.params == 0)) 
            stop("infinite values in PWM due to 0's in 'prior.params'")
        prior.probs <- prior.params/priorN
        ans <- log2(postProbs/prior.probs)
    }
    else {
        ans <- postProbs
    }
    
    # if to apply unit.scale
    if(unit.scale)
	   	ans = unitScale(ans)
	   	
	# return an object 
	new("PWM", id=id, name=name, pfm=x, prior.params=prior.params, pwm=ans)
}

#' Scan the whole sequence on both strands
#'
#' The whole sequence is scanned with a PWM and scores returned beginning at each position. Partial motif
#' matches are not done, thus the last #[length of motif]-1 scores are NA. 
#' 
#' The function returns either an odds average (*not* log-odds average), maximal score on each strand,
#' or scores on both strands. 
#'
#' The function by default returns the score in log2 following the package \code{Biostrings}. 
#'
#' @param pwm PWM object
#' @param dna a DNAString or other sequence from Biostrings
#' @param pwm.rev the reverse complement for a pwm (if it is already pre-computed)
#' @param odds.score if to return raw scores in odds (not logodds) space
#' @param both.strands if to return results on both strands
#' @param strand.fun which function to use to summarise values over two strands (default is "mean")
#' @return a vector representing scores starting at each position, or a matrix with score in the two strands
#' @export
#' @examples
#' 
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    ttk = MotifDb.Dmel[["ttk"]]
#'
#'    # odds average over the two strands expressed as log2-odds
#'    scanWithPWM(ttk, DNAString("CGTAGGATAAAGTAACT"))
#'    
#'    # log2-odds scores on both strands
#'    scanWithPWM(ttk, DNAString("CGTAGGATAAAGTAACT"), both.strands=TRUE)
#' }
#' 
scanWithPWM = function(pwm, dna, pwm.rev=NULL, odds.score=FALSE, both.strands=FALSE, strand.fun="mean"){
	if(is.character(dna))
		dna = DNAString(dna)

	if(!inherits(dna, c("DNAString", "DNAStringSet")))
		stop("The input sequence needs to be either of type DNAString or DNAStringSet")
	

	if(is.null(pwm.rev))
		pwm.rev = reverseComplement(pwm)
		
	# extract only the PWM matrices
	pwm = pwm$pwm
	pwm.rev = pwm.rev$pwm
	
	if(length(dna) < ncol(pwm)){
		stop("DNA sequence needs to be at least as long as the PWM")
	}

	fwd.motif = PWMscoreStartingAt(pwm, dna, starting.at=1:(length(dna)-ncol(pwm)+1))
	back.motif = PWMscoreStartingAt(pwm.rev, dna, starting.at=1:(length(dna)-ncol(pwm)+1))
	
	if(both.strands){
		res = cbind(fwd.motif, back.motif)
		colnames(res) = c("fwd", "rev")
		res = rbind(res, matrix(NA, nrow=(ncol(pwm)-1), ncol=2))
	} else {
		if(strand.fun == "max"){
			res = apply(cbind(fwd.motif, back.motif), 1, max)
		} else if(strand.fun == "mean"){
			res = 2^cbind(fwd.motif, back.motif)
			res = log2(rowMeans(res))
		} else {
			stop("Unknown strand function", strand.fun, ". Valid values are: mean, max")
		}
		# pad out the result so it is of same length as input sequence
		res = c(res, rep(NA, ncol(pwm)-1))
	}
	
	if(odds.score)
		return(2^res)
	else
		return(res)
}

#' Convert frequencies into motifs using PWMUnscaled
#'
#' Note that this function is deprecated and replaced by \code{toPWM()}. 
#'
#' @param motifs a list of motifs represented as matrices of frequencies (PFM)
#' @param id the set of IDs for the motifs (defaults to names of the 'motifs' list)
#' @param name the set of names for the motifs (defaults to names of the 'motifs' list)
#' @param seq.count if frequencies in the motifs are normalized to 1, provides a vector of sequence counts (e.g. for MotifDb motifs)
#' @param ... other parameters to PWMUnscaled
#'
#' @export
#' @examples
#' \dontrun{
#' if (requireNamespace("PWMEnrich.Dmelanogaster.background")) {
#'   data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'   # convert to PWM with uniform background
#'   PFMtoPWM(MotifDb.Dmel.PFM)
#'
#'   # get background for drosophila (quick mode on a reduced dataset)
#'   prior = getBackgroundFrequencies("dm3", quick=TRUE)
#'    
#'   # convert with genomic background 
#'   PFMtoPWM(MotifDb.Dmel.PFM, prior.params=prior)
#' }
#' }
PFMtoPWM = function(motifs, id=names(motifs), name=names(motifs), seq.count=NULL, ...){
	if(!is.list(motifs)){
		was.list = FALSE
		motifs = list(motifs)
	} else {
		was.list = TRUE
	}
	
	if(is.null(id))
		id = rep("", length(motifs))
	if(is.null(name))
		name = rep("", length(motifs))
	
	if(length(id) != length(motifs))
		stop("The number of IDs (parameter 'id') need to be the same as number of motifs (parameter 'motifs')")

	if(length(name) != length(motifs))
		stop("The number of names (parameter 'name') need to be the same as number of motifs (parameter 'motifs')")
		
	if(!is.null(seq.count) && length(seq.count) != length(motifs)){
		stop("The 'seq.count' vector needs to be of the same length as the list of motifs (parameters 'motifs')")
	}
	
	# call PWMUnscaled
	res = list()
	for(i in 1:length(motifs)){
		if(is.null(seq.count)){
			res[[i]] = PWMUnscaled(motifs[[i]], id=id[i], name=name[i], ...)
		} else {
			res[[i]] = PWMUnscaled(motifs[[i]], id=id[i], name=name[i], seq.count=seq.count[i], ...)
		}
	}
	
	names(res) = names(motifs)
	
	# convert back to single object if that's how the input was
	if(!was.list){
		return(res[[1]])
	} else {
		return(res)
	}

}

#' Convert motifs into PWMs
#'
#' @param motifs a list of motifs either as position probability matrices (PPM) or frequency matirces (PFMs)
#' @param ids the set of IDs for the motifs (defaults to names of the 'motifs' list)
#' @param targets the set of target TF names for the motifs (defaults to names of the 'motifs' list)
#' @param seq.count provides a vector of sequence counts for probability matrices (PPMs). Default it 50. 
#' @param prior frequencies of the four letters in the genome. Default is uniform background.
#' @param ... other parameters to PWMUnscaled
#'
#' @export
#' @examples
#' \dontrun{
#' if (requireNamespace("PWMEnrich.Dmelanogaster.background")) {
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    toPWM(MotifDb.Dmel.PFM) # convert to PWM with uniform background
#'
#'    # get background for drosophila (quick mode on a reduced dataset)
#'    prior = getBackgroundFrequencies("dm3", quick=TRUE)
#'    toPWM(MotifDb.Dmel.PFM, prior=prior) # convert with genomic background 
#' }
#' }
toPWM = function(motifs, ids=names(motifs), targets=names(motifs), seq.count=50, prior=c(A=0.25, C=0.25, G=0.25, T=0.25), ...){
	if(!is.list(motifs)){
		was.list = FALSE
		motifs = list(motifs)
	} else {
		was.list = TRUE
	}
	
	# extend seq.count if needed
	if(length(seq.count) == 1 && length(seq.count) != length(motifs))
		seq.count = rep(seq.count, length(motifs))
	
	# convert each individual motif into integers or multiply with seq.count
	for(i in 1:length(motifs)){
		m = motifs[[i]]
		
		mi = apply(m, 1:2, as.integer)
		if(all(m == mi)){
			# can be coersced to an integer matrix, use that
			motifs[[i]] = mi
		} else {
			# if the values is NA use 50 as default
			if(is.na(seq.count[i])){
				motifs[[i]] = apply(round(m * 50), 1:2, as.integer)
			} else {
				motifs[[i]] = apply(round(m * seq.count[i]), 1:2, as.integer)
			}
		}
		
	}
	
	if(is.null(ids))
		ids = rep("", length(motifs))
	if(is.null(targets))
		targets = rep("", length(motifs))
	
	if(length(ids) != length(motifs))
		stop("The number of IDs (parameter 'ids') need to be the same as number of motifs (parameter 'motifs')")

	if(length(targets) != length(motifs))
		stop("The number of TF target names (parameter 'targets') need to be the same as number of motifs (parameter 'motifs')")
	
	# call PWMUnscaled
	res = list()
	for(i in 1:length(motifs)){
		res[[i]] = PWMUnscaled(motifs[[i]], id=ids[i], name=targets[i], prior.params=prior, ...)
	}
	
	names(res) = names(motifs)
	
	# convert back to single object if that's how the input was
	if(!was.list){
		return(res[[1]])
	} else {
		return(res)
	}

}

#' Normalizes the motifs input argument for multiple functions
#'
#' @param motifs a list of motifs either as frequency matrices (PFM) or as PWM objects. If PFMs are specified
#'               they are converted to PWMs using uniform background. 
.inputParamMotifs = function(motifs){
	# check motifs format and convert to PWM
	if(!is.list(motifs))
		motifs = list(motifs)
		
	if(is.matrix(motifs[[1]]))
		pwms = PFMtoPWM(motifs)
	else if(inherits(motifs[[1]], "PWM"))
		pwms = motifs
	else
		stop("motifs need to be either frequency matrices or PWM objects")
		
	return(pwms)
}

#' Normalize the sequences input argument
#'
#' @param sequences a set of sequences to be scanned, a list of DNAString or other scannable objects
.inputParamSequences = function(sequences){
	if(is.character(sequences)){
		sequences = readDNAStringSet(sequences)
	}
	# make sure sequences are in the right format
	if(!is.list(sequences) & !inherits(sequences, "DNAStringSet"))
		sequences = list(sequences)
	
	if(is.list(sequences) & length(sequences)>0){
		sequences = lapply(sequences, function(s){
			if(is.character(s))
				DNAString(s)
			else
				s
		})
	}
		
	#if(class(sequences) == "DNAStringSet")
	#	sequences = DNAStringSetToList(sequences)
		
	return(sequences)
}

#' Motif affinity or number of hits over a threshold
#'
#' Scan a number of sequences either to find overall affinity, or a number of hits over a score threshold. 
#'
#' @param sequences a set of sequences to be scanned, a list of DNAString or other scannable objects
#' @param motifs a list of motifs either as frequency matrices (PFM) or as PWM objects. If PFMs are specified
#'               they are converted to PWMs using uniform background. 
#' @param raw.scores if to return raw scores (odds) for each position in the sequence. Note that scores for forward and reverse 
#'                   strand are concatenated into a single long vector of scores (twice the length of the sequence)
#' @param verbose if to print verbose output
#' @param cutoff if not NULL, will count number of matches with score above value specified (instead of returning the average affinity). 
#'               Can either be one value, or a vector of values for each of the motifs. 
#' @return if raw.scores=FALSE, returns a matrix of mean scores (after cutoff if any), where columns are motifs. 
#'         The returned values are either mean odd scores (not log-odd), or number of hits above a threshold;
#'         otherwise if raw.scores=TRUE, returns a list of raw score values (before cutoff)
#' @export
#' @examples
#'
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # affinity scores
#'    affinity = motifScores(DNAString("CGTAGGATAAAGTAACTAGTTGATGATGAAAG"), MotifDb.Dmel)
#'    
#'    # motif hit count with Patser score of 4
#'    counts = motifScores(DNAString("CGTAGGATAAAGTAACTAGTTGATGATGAAAG"), MotifDb.Dmel, 
#'      cutoff=log2(exp(4)))
#'    
#'    print(affinity)
#'    print(counts)
#'
#'    # scanning multiple sequences
#'    sequences = list(DNAString("CGTAGGATAAAGTAACTAGTTGATGATGAAAG"), 
#'      DNAString("TGAGACGAAGGGGATGAGATGCGGAAGAGTGAAA"))
#'    affinity2 = motifScores(sequences, MotifDb.Dmel)
#'    print(affinity2)
#' }
motifScores = function(sequences, motifs, raw.scores=FALSE, verbose=TRUE, cutoff=NULL){	
	if(!is.null(.PWMEnrich.Options[["useBigMemory"]]) && .PWMEnrich.Options[["useBigMemory"]])
		return(motifScoresBigMemory(sequences, motifs, raw.scores, verbose, cutoff))

	# check motifs format and convert to PWM
	pwms = .inputParamMotifs(motifs)
	sequences = .inputParamSequences(sequences)
	
	# check if any of the sequences is all Ns, if so, produce an error
	if(is(sequences, "DNAStringSet")){
		al = alphabetFrequency(sequences)
	} else {
		al = do.call("rbind", lapply(sequences, alphabetFrequency))
	}
	
	# check if any of the sequences is all Ns
	all.n = which(al[,"N"] == rowSums(al))
	if(length(all.n) >0){
		stop(paste("Sequence with index(es):", paste(all.n, collapse=","), "only contain N's. Please remove this sequence as the PWM score cannot be computed for any position."))
	}
		
	pwms.rev = lapply(pwms, reverseComplement)
	
	# always unfold cutoff into a vector of values
	if(!is.null(cutoff)){
		if(length(cutoff) == 1){
			cutoff = rep(cutoff, length(pwms))
		} else if(length(cutoff) != length(pwms)) {
			stop("The lengths of cutoff and pwms do not match. Either provide one values for cutoff, or a vector of values, one for each PWM")
		} 
	}
	
	# inner loop function to use for parallel processing
	motifScoresLoop = function(i){
		if(verbose)
			message(paste("Scanning sequence", i, "/", length(sequences)))
		s = sequences[[i]]
		
		# if using raw scores, record matrix of values;
		# otherwise a single final value for each pwm
		if(raw.scores){
			r = matrix(0, nrow=length(s)*2, ncol=length(pwms))
			colnames(r) = names(pwms)
		} else {
			r = rep(0, length(pwms))
			names(r) = names(pwms)
		}
		
		for(j in 1:length(pwms)){
			# if we are interested in counts, use max over two strands
			
			if(raw.scores){
				# record raw scores that are averages over strands
				r[,j] = as.vector(scanWithPWM(pwms[[j]], s, pwms.rev[[j]], odds.score=TRUE, both.strands=TRUE))
			} else if(!is.null(cutoff)){
				# count number of hits on both strands
				r.pwm = scanWithPWM(pwms[[j]], s, pwms.rev[[j]], odds.score=FALSE, both.strands=TRUE)
				r[j] = sum(r.pwm >= cutoff[j], na.rm=TRUE)
			} else {
				# do average over both strands and also over all values
				r[j] = mean(scanWithPWM(pwms[[j]], s, pwms.rev[[j]], odds.score=TRUE), na.rm=TRUE)
			}
		}
		
		r
	}
	
	####################################### END OF INNER LOOP ################

	# either do it parallel or serial
	if(!is.null(.PWMEnrich.Options[["numCores"]])){
		cat("Parallel scanning with", .PWMEnrich.Options[["numCores"]], "cores\n")
		# do it in parallel
		res = parallel::mclapply(1:length(sequences), motifScoresLoop, mc.cores = .PWMEnrich.Options[["numCores"]])
		
		if(is.list(res)){
			if( any(sapply(res, is.null)) ){
				stop("Parallel scanning failed for some sequences. This could be due to a number of reasons including not enough memory.")
			}
		}
	} else {
		# do it serial
		res = lapply(1:length(sequences), motifScoresLoop)
	}
	
	## return either raw scores, or counts or means
	if(raw.scores){
		names(res) = names(sequences)
		return(res)
	} else {
		if(length(pwms) == 1){
			return( matrix(sapply(res, identity), ncol=1, dimnames=list(NULL, names(pwms))) )
		} else {
			r = t(sapply(res, identity))
			rownames(r) = names(sequences)
			colnames(r) = names(pwms)
			return( r )
		}
	}
}

#' This is a memory intensive version of motifScore() which is about 2 times faster
#'
#' The parameters and functionality are the same as \code{\link{motifScores}}. Please refer to documentation of this function
#' for detailed explanation of functionality. 
#'
#' This function is not meant to be called directly, but is indirectly called by motifScores() once a global parameters useBigMemory is set. 
#'
#' @param sequences set of input sequences
#' @param motifs set of input PWMs or PFMs
#' @param raw.scores if to return scores for each base-pair
#' @param verbose if to produce verbose output
#' @param cutoff the cutoff for calling binding sites (in base 2 log). 
#' @param seq.all already concatenated sequences if already available (used to internally speed up things)
#'
#' @seealso \code{\link{motifScores}}
motifScoresBigMemory = function(sequences, motifs, raw.scores=FALSE, verbose=TRUE, cutoff=NULL, seq.all=NULL){	
	# check motifs format and convert to PWM
	pwms = .inputParamMotifs(motifs)
	sequences = .inputParamSequences(sequences)
		
	pwms.rev = lapply(pwms, reverseComplement)
	pwms.len = sapply(pwms, length)
	min.pwm.len = min(pwms.len)
	
	# always unfold cutoff into a vector of values
	if(!is.null(cutoff)){
		if(length(cutoff) == 1){
			cutoff = rep(cutoff, length(pwms))
		} else if(length(cutoff) != length(pwms)) {
			stop("The lengths of 'cutoff' and 'pwms' do not match. Either provide one values for 'cutoff' or a vector of values, one for each PWM")
		} 
		
		# cutoff in odds space
		cutoff.2 = 2^cutoff
	}
	
	# work on a single sequence that is all concatenated together because it's faster
	if(is.null(seq.all))
		seq.all = DNAString(concatenateSequences(sequences))
	seq.len = sapply(sequences, length)
	
	# a common error with shorter sequences
	shorter.seq = which(seq.len < min.pwm.len)
	if(length(shorter.seq)>0){
		stop(paste(length(shorter.seq), "sequence(s) have length shorter than the shortest PWM (", min.pwm.len, "), please identify and remove these sequences."))
	}
	
	# a grouping data frame
	seq.len.sum = cumsum(seq.len)
	seq.group = data.frame("from"=rep(0, length(sequences)), "to"=0)
	seq.group$from = c(1, 1+seq.len.sum[-length(seq.len.sum)])
	seq.group$to = seq.len.sum
	
	# inner loop function to use for parallel processing
	#
	# @param motif.start the index of the first motif
	# @param motif.end the index of the last motif 
	motifScoresLoop = function(input.params){
		motif.start = input.params[[1]]
		motif.end = input.params[[2]]
	
		if(verbose){
			if(motif.start == motif.end){
				message(paste("Starting scanning with motif", names(pwms)[motif.start]))
			} else {
				message(paste("Starting scanning with motifs from", motif.start, "to", motif.end))
			}
		}
		# local copy of the sequences and motifs used in the function
		s = seq.all
		s.group = seq.group
		pwms.l = pwms[motif.start:motif.end]
		pwms.rev.l = pwms.rev[motif.start:motif.end]
		pwms.len.l = pwms.len[motif.start:motif.end]
		if(!is.null(cutoff))
			cutoff.2.l = cutoff.2[motif.start:motif.end]
		
		# create the output structure
		res = list()
		for(i in 1:nrow(s.group)){
			if(raw.scores){			
				r = matrix(0, nrow=seq.len[i]*2, ncol=length(pwms.l))
				colnames(r) = names(pwms.l)
				res[[i]] = r		
			} else {
				r = rep(0, length(pwms.l))
				names(r) = names(pwms.l)		
				res[[i]] = r
			}
		}
		
		### do the scanning, convert back to individual sequences and do the post-processing
		for(j in 1:length(pwms.l)){
			if(verbose){
				message(paste("Scanning all sequences with motif", j, "/", length(pwms.l)))
			}
			raw = as.vector(scanWithPWM(pwms.l[[j]], s, pwms.rev.l[[j]], odds.score=TRUE, both.strands=TRUE))
			
			### split the raw results by sequence
			for(i in 1:nrow(s.group)){
				# selector for this sequence
				from = seq.group$from[i]
				to = seq.group$to[i]
				from.rev = length(raw)/2 + seq.group$from[i]
				to.rev = length(raw)/2 + seq.group$to[i]
				sel = c(from:to, from.rev:to.rev)
			
				r = raw[sel]
				
				# fwd strand
				r[((seq.len[i]-pwms.len.l[j])+2):seq.len[i]] = NA
				# rev strand
				r[((2*seq.len[i]-pwms.len.l[j])+2):(2*seq.len[i])] = NA

				# save the values				
				if(raw.scores){
					res[[i]][,j] = r
				} else if(!is.null(cutoff)){
					# count number of hits on both strands 
					res[[i]][j] = sum(r >= cutoff.2.l[j], na.rm=TRUE)
				} else {
					# MeanAffinity
					res[[i]][j] = mean(r, na.rm=TRUE)
				}
				
			}
		}
				
		res
	}
	######################################## END OF INNER LOOP ######################################

	# either do it parallel or serial
	if(!is.null(.PWMEnrich.Options[["numCores"]]) && length(pwms) >= .PWMEnrich.Options[["numCores"]]){
		cores = .PWMEnrich.Options[["numCores"]]
		sel = round(seq(0, length(pwms), length.out=cores+1))
		start = sel[1:cores]+1
		end = sel[2:(cores+1)]
		input = lapply(1:cores, function(i) list(start[i], end[i]))
		# do it in parallel
		res.parallel = parallel::mclapply(input, motifScoresLoop, mc.cores = cores)
		
		if(is.list(res.parallel)){
			if( any(sapply(res.parallel, is.null)) ){
				stop("Parallel scanning failed for some sequences. This could be due to a number of reasons including not enough memory.")
			}
		}	

		# concatenate results for different motifs		
		if(raw.scores)
			res = lapply(1:length(sequences), function(i) do.call("cbind", lapply(res.parallel, function(x) x[[i]])))	
		else
			res = lapply(1:length(sequences), function(i) do.call("c", lapply(res.parallel, function(x) x[[i]])))
	} else {
		# do it serial
		res = motifScoresLoop(list(1, length(pwms)))
	}
	
	
	
	## return either raw scores, or counts or means
	if(raw.scores){
		names(res) = names(sequences)
		return(res)
	} else {
		if(length(pwms) == 1){
			return( matrix(sapply(res, identity), ncol=1, dimnames=list(names(sequences), names(pwms))) )
		} else {
			r = t(sapply(res, identity))
			rownames(r) = names(sequences)
			colnames(r) = names(pwms)
			return( r )
		}
	}
}


#' Information content for a PWM or PFM
#'
#' @param motif a matrix of frequencies, or a PWM object
#' @param prior.params the prior parameters to use when a matrix is given (ignored if motif is already a PWM)
#' @param bycol if to return values separately for each column
#' @return information content in bits (i.e. log2)
#' @export
#' @examples
#'
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # the nucleotide distribution is taken from the PWM (in this case genomic background)
#'    motifIC(MotifDb.Dmel[["ttk"]]) 
#'    # information content with default uniform background because the input is a matrix, 
#'    # not PWM object
#'    motifIC(MotifDb.Dmel.PFM[["ttk"]]) 
#' }
motifIC = function(motif, prior.params=c(A=0.25, C=0.25, G=0.25, T=0.25), bycol=FALSE){
	if(inherits(motif, "PWM")) {
		p = PWMUnscaled(motif$pfm, type="prob", prior.params=motif$prior.params)
		bg = motif$prior.params
	} else {
		p = PWMUnscaled(motif, type="prob", prior.params=prior.params)
		bg = prior.params
	}
		
	bg = bg/sum(bg) # normalize to 1
	
	cols = p$pwm * log2(p$pwm/bg)
	if(bycol)
		return(colSums(cols))
	else
		return(sum(cols))
}

#' Calculate motif enrichment using one of available scoring algorithms and background corrections. 
#'
#' This function provides and interface to all algorithms available in PWMEnrich
#' to find motif enrichment in a single or a group of sequences with/without
#' background correction. 
#'
#' Since for all algorithms the first step involves calculating raw scores without background correction, the output
#' always contains the scores without background correction together with (optional) background-corrected
#' scores. 
#'
#' Unless otherwise specified the scores are returned both separately for each sequence (without/with background) and
#' for the whole group of sequences (without/with background). 
#'
#' To use a background correction you need to supply a set of PWMs with precompiled background distribution parameters
#' (see function \code{\link{makeBackground}}). When such an object is supplied as the \code{pwm} parameter, the scoring 
#' scheme and background correction are automatically determined. 
#'
#' There are additional packages with already pre-computed background (e.g. see package \code{PWMEnrich.Dmelanogaster.background}).
#'
#' Please refer to (Stojnic & Adryan, 2012) for more details on the algorithms. 
#'
#' @title Motif enrichment
#' @param sequences the sequences to be scanned for enrichment. Can be either a single sequence 
#'        (an object of class DNAString), or a list of DNAString objects, or a DNAStringSet object.
#' @param pwms this parameter can take multiple values depending on the scoring scheme and background correction used.
#'             When the \code{method} parameter is set to "autodetect", the following default algorithms
#'             are going to be used: 
#'        \itemize{
#'           \item if \code{pwms} is a list containing either frequency matrices or a list of PWM objects then
#'                 the "affinity" algorithm is selected. If frequency matrices are given, they are converted 
#'                 to PWMs using uniform background. For best performance, convert frequency matrices to PWMs
#'                 before calling this function using realistic genomic background. 
#'           \item Otherwise, appropriate scoring scheme and background correction are selected based on the
#'                 class of the object (see below). 
#'        }
#' @param score this parameter determines which scoring scheme to use. Following scheme as available:
#'        \itemize{
#'           \item "autodetect" - default value. Scoring method is determined based
#'                 on the type of \code{pwms} parameter. 
#'           \item "affinity" - use threshold-free affinity score. The \code{pwms}
#'                 parameter can either be a list of frequency matrices, \code{PWM} objects, or a 
#'                 \code{PWMLognBackground} object. 
#'           \item "cutoff" - use number of motif hits above a score cutoff. The \code{pwms}
#'                 parameter can either be a list of frequency matrices, \code{PWM} objects, or a 
#'                 \code{PWMCutoffBackground} object.
#'           \item "clover" - use the Clover algorithm (Frith et al, 2004). The Clover score of a single 
#'                 sequence is identical to the affinity score, while for a group of sequences is an
#'                 average of products of affinities over all sequence subsets. 
#'         }
#'            
#' @param bg this parameter determines how the raw score is compared to the background distribution. 
#'        \itemize{
#'           \item "autodetect" - default value. Background correction is determined based on the type
#'                  of the \code{pwms} parameter. 
#'           \item "logn" - use a lognormal distribution background pre-computed for a set of PWMs.
#'                 This requires \code{pwms} to be of class \code{PWMLognBackground}.
#'           \item "z" - use a z-score for the number of significant motif hits compared to background number of hits.
#'                 This requires \code{pwms} to be of class \code{PWMCutoffBackground}.
#'           \item "pval" - use empirical P-value based on a set of background sequences. This requires
#'                 \code{pwms} to be of class \code{PWMEmpiricalBackground}. Note that PWMEmpiricalBackground
#'                 objects tend to be very large so that the empirical P-value can be calculated in reasonable time.
#'           \item "ms" - shuffle columns of motif matrices and use that as basis for P-value calculation. Note that
#'                  since the sequences need to rescanned with all of the new shuffled motifs this can be very slow. 
#'                  Also, this also works only no *individual* sequences, not groups. 
#'           \item "none" - no background correction
#'         }
#' @param cutoff the score cutoff for a significant motif hit if scoring scheme "cutoff" is selected. 
#' @param verbose if to print verbose output
#' @param motif.shuffles number of times to shuffle motifs if using "ms" background correction
#' @param B number of replicates when calculating empirical P-value
#' @param group.only if to return statistics only for the group of sequences, not individual sequences. In the case of
#'                   empirical background the P-values for individual sequences are not calculated (thus saving time), for other 
#'                   backgrounds they are calculated but not returned.  
#' @return a MotifEnrichmentResults object containing a subset following elements:
#'         \itemize{
#'           \item "score" - scoring scheme used
#'           \item "bg" - background correction used
#'           \item "params" - any additional parameters
#'			 \item "sequences" - the set of sequences used
#'           \item "pwms" - the set of pwms used
#'           \item "sequence.nobg" - per-sequence scores without any background correction. 
#'                   For "affinity" and "clover" a matrix of mean affinity scores; for
#'                   "cutoff" number of significant hits above a cutoff 
#'           \item "sequence.bg" - per-sequence scores after background correction. For "logn" and "pval" the P-value (smaller is better);
#'                                 for "z" and "ms" background corrections the z-scores (bigger is better). 
#'           \item "group.nobg" - aggregate scores for the whole group of sequences without background correction. For "affinity"
#'                                and "clover" the mean affinity over all sequences in the set; for "cutoff" the total number of hits in all
#'                                sequences.
#'           \item "group.bg" - aggregate scores for the whole group of sequences with background correction. For "logn" and "pval",
#'                              the P-value for the whole group (smaller is better); for "z" and "ms" the z-score for the whole set (bigger is better).
#'           \item "sequence.norm" - (only for "logn") the length-normalized scores for each of the sequences. Currently only implemented
#'                                   for "logn", where it returns the values normalized from LogN(0,1) distribution
#'           \item "group.norm" - (only for "logn") similar to sequence.norm, but for the whole group of sequences
#'         }
#' @export
#' @references \itemize{ 
#'  \item R. Stojnic & B. Adryan: Identification of functional DNA motifs using a binding affinity lognormal background distribution, submitted.
#' 	\item MC Frith et al: Detection of functional DNA motifs via statistical over-representation, Nucleid Acid Research (2004).
#' }
#' @examples 
#'
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    ###
#'    # load the pre-compiled lognormal background
#'    data(PWMLogn.dm3.MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # scan two sequences for motif enrichment
#'    sequences = list(DNAString("GAAGTATCAAGTGACCAGTAGATTGAAGTAGACCAGTC"), 
#'      DNAString("AGGTAGATAGAACAGTAGGCAATGGGGGAAATTGAGAGTC"))
#'    res = motifEnrichment(sequences, PWMLogn.dm3.MotifDb.Dmel)
#'
#'    # most enriched in both sequences (lognormal background P-value)
#'    head(motifRankingForGroup(res))
#'
#'    # most enriched in both sequences (raw affinity, no background)
#'    head(motifRankingForGroup(res, bg=FALSE))
#'
#'    # most enriched in the first sequence (lognormal background P-value)
#'    head(motifRankingForSequence(res, 1))
#'
#'    # most enriched in the first sequence (raw affinity, no background)
#'    head(motifRankingForSequence(res, 1, bg=FALSE))
#'
#'    ###
#'    # Load the pre-compiled background for hit-based motif counts with cutoff of P-value = 0.001 
#'    data(PWMPvalueCutoff1e3.dm3.MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    res.count = motifEnrichment(sequences, PWMPvalueCutoff1e3.dm3.MotifDb.Dmel)
#'
#'    # Enrichment in the whole group, z-score for the number of motif hits
#'    head(motifRankingForGroup(res))
#'
#'    # First sequence, sorted by number of motif hits with P-value < 0.001
#'    head(motifRankingForSequence(res, 1, bg=FALSE))
#'    
#' }
motifEnrichment = function(sequences, pwms, score="autodetect", bg="autodetect", cutoff=NULL, verbose=TRUE, motif.shuffles=30, B=1000,
	group.only=FALSE){
	
	# detect scoring method
	if(score == "autodetect"){
		if(inherits(pwms, "PWMLognBackground")) {
			score = "affinity"
		} else if(inherits(pwms, "PWMCutoffBackground")) {
			score = "cutoff"
		} else {
			score = "affinity"
		}
	}
	
	if(score == "cutoff" & is.null(cutoff))
		cutoff = pwms@bg.cutoff
	
	# the PWM* object
	pwmobj = NULL
	
	# detect background correction
	if(!(bg %in% c("none", "ms"))){
		if(inherits(pwms, "PWMLognBackground")) {
			bg = "logn"
			pwmobj = pwms
			pwms = pwmobj@pwms
		} else if(inherits(pwms, "PWMCutoffBackground")) {
			bg = "z"
			pwmobj = pwms
			pwms = pwmobj@pwms
		} else if(inherits(pwms, "PWMEmpiricalBackground")) {
			bg = "pval"
			pwmobj = pwms
			pwms = pwmobj@pwms
		} else if(inherits(pwms, "PWMGEVBackground")) {
			bg = "gev"
			pwmobj = pwms
			pwms = pwmobj@pwms
		} else {
			bg = "none"
		}
	}

	if(!is.list(pwms))
		pwms = list(pwms)
	
	sequences = .inputParamSequences(sequences)
	
	# return list
	res = list(score=score, bg=bg, pwms=pwms, sequences=sequences, params=list())
	
	# apply no-bg scoring
	if(score == "affinity"){
		# lengths of sequences and PWMs
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		
		res$sequence.nobg = motifScores(sequences, pwms, verbose=verbose)
		res$group.nobg = affinitySequenceSet(res$sequence.nobg, seq.len, pwm.len)
	} else if(score == "clover"){
		res$sequence.nobg = motifScores(sequences, pwms, verbose=verbose)
		res$group.nobg = cloverScore(res$sequence.nobg, verbose=verbose)
	} else if(score == "cutoff"){
		res$params = list(cutoff=cutoff)
		res$sequence.nobg = motifScores(sequences, pwms, cutoff=cutoff, verbose=verbose)
		res$group.nobg = colSums(res$sequence.nobg)
	} else {
		stop(paste("Unknown scoring algorithm: '", score, "'. Please select one of: 'affinity', 'cutoff', 'clover'", sep=""))
	}
	
	if(verbose){
		cat("Calculating motif enrichment scores ...\n")
	}
	
	# apply background correction if needed
	if(bg == "none"){
		res$sequence.bg = NULL
		res$group.bg = NULL
	} else if(bg == "logn"){
		# lengths of sequences and PWMs
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		# do it per sequence
		res$sequence.bg = logNormPval(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.mean, pwmobj@bg.sd, pwmobj@bg.len)
		colnames(res$sequence.bg) = names(pwms)
		# convert logn P-values into normalized observations and run Clover on that
		res$sequence.norm = apply(res$sequence.bg, 1:2, qlnorm, lower.tail=FALSE)
		
		if(score == "affinity"){
			# and for the group
			if(is.matrix(pwmobj@bg.mean)){
				# convert back to P-values, using the chi-sq distribution
				res$group.bg = apply(res$sequence.bg, 2, function(x) {
					pchisq(-2*sum(log(x)), 2*length(x), lower.tail=FALSE)
				})
				#res$group.bg = colMeans(log(res$sequence.bg))
				res$group.norm = sapply(res$group.bg, qlnorm, lower.tail=FALSE)
			} else {
				res$group.bg = logNormPvalSequenceSet(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.mean, pwmobj@bg.sd, pwmobj@bg.len)
				res$group.norm = sapply(res$group.bg, qlnorm, lower.tail=FALSE)
			}
			
		} else if(score == "clover"){
			# now run Clover on normalized scores
			res$group.bg = cloverScore(res$sequence.norm, verbose=verbose)
		}
	} else if(bg == "z"){
		# lengths of sequences and PWMs
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		res$sequence.bg = cutoffZscore(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.P)
		res$group.bg = cutoffZscoreSequenceSet(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.P)
	} else if(bg == "pval"){
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		
		# if to use cutoff
		usecutoff = NULL
		if(score == "cutoff")
			usecutoff = cutoff
		
		if(!group.only){
			res$sequence.bg = t(sapply(1:length(seq.len), function(i)
				 empiricalPvalue(res$sequence.nobg[i,], seq.len[i], pwm.len, pwmobj@bg.fwd, pwmobj@bg.rev, cutoff=usecutoff, B=B, verbose=verbose)))
		}
		
		# do empirical P-value
		if(score == "clover")
			res$group.bg = cloverPvalue1seq(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.fwd, pwmobj@bg.rev, B=B, verbose=verbose, clover=res$group.nobg)
		else	 
			res$group.bg = empiricalPvalueSequenceSet(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.fwd, pwmobj@bg.rev, cutoff=usecutoff, B=B, verbose=verbose)
	} else if(bg == "ms"){
		# if to use cutoff
		usecutoff = NULL
		if(score == "cutoff")
			usecutoff = cutoff
		res$sequence.bg = matrixShuffleZscorePerSequence(res$sequence.nobg, sequences, pwms, cutoff=usecutoff, B=motif.shuffles)
	} else if(bg == "gev"){
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		
		res$sequence.bg = gevPerSequence(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.loc, pwmobj@bg.scale, pwmobj@bg.shape)
		res$group.bg = NULL
	} else {
		stop(paste("Uknown background correction algorithm: '", bg, "', Please select one of: 'none', 'logn', 'z', 'pval', 'ms'", sep=""))
	}
	
	# only store the values for the group, not individual sequences!
	if(group.only){
		seq.n = grep("^sequence[.]", names(res))
		if(length(seq.n)>0){
			res = res[-seq.n]
		}
	}
	
	# restore the sequence names
	if("sequence.nobg" %in% names(res)){
		rownames(res$sequence.nobg) = names(sequences)
	}
	if("sequence.bg" %in% names(res)){
		rownames(res$sequence.bg) = names(sequences)
	}
	
	return(new("MotifEnrichmentResults", res=res))
		
}

#' Obtain z-score for motif column shuffling
#'
#' All PWMs are shuffled at the same time. This function would be too slow to produce
#' empirical P-values, thus we return a z-score from a small number of shuffles. 
#'
#' The z-scores are calculated for each sequence individually. 
#' 
#' @param scores a set of already calculated scores
#' @param sequences either one sequence or a list/set of sequences (objects of type DNAString or DNAStringSet)
#' @param pwms a list of PWMs
#' @param cutoff if NULL, will use affinity, otherwise will use number of hits over this log2 odds cutoff
#' @param B number of replicates, i.e. PWM column shuffles
matrixShuffleZscorePerSequence = function(scores, sequences, pwms, cutoff=NULL, B=30){
	# scores from different replicates for each of the sequences
	scores.matrix = array(0, dim=c(nrow(scores), ncol(scores), B))
	dimnames(scores.matrix) = list(rownames(scores), colnames(scores), NULL)
	
	# perfom shuffles	
	for(i in 1:B){
		cat("Perfoming shuffle", i, "/", B, "\n")
		pwms.shuffled = pwms
		for(j in 1:length(pwms)){
			p = pwms[[j]]
			pwms.shuffled[[j]]@pwm = p@pwm[, sample(ncol(p@pwm))]
		}
		
		scores.suffled = motifScores(sequences, pwms.shuffled, cutoff=cutoff, verbose=FALSE)
		scores.matrix[,,i] = scores.suffled
		
	}
	
	# work out mean and sd
	shuffled.mean = apply(scores.matrix, c(1,2), mean)
	shuffled.sd = apply(scores.matrix, c(1,2), sd)
	
	return( (scores - shuffled.mean) / shuffled.sd )
}


#' Calculate the P-value from lognormal distribution with background of equal length
#' 
#' @param scores affinity scores for the PWMs, can contain scores for more than one sequence (as rows), P-values are extracted separately
#' @param seq.len the length distribution of the sequences
#' @param pwm.len the leggths of PWMs
#' @param bg.mean the mean values from the background for PWMs
#' @param bg.sd the sd values from the background
#' @param bg.len the length distribution of the background (we currently support only constant length)
#' @param log if to produce log p-values
logNormPval = function(scores, seq.len, pwm.len, bg.mean, bg.sd, bg.len, log=FALSE){
	if(is.vector(scores)){
		scores = matrix(scores, nrow=1, dimnames=list(NULL, names(scores)))
	}
	if( length(seq.len) != nrow(scores) ){
		stop("The sequence length distribution and number of rows for scores doesn't match")
	}
	if( !all(colnames(scores) == names(bg.mean)) ){
		stop("The motifs column names in 'scores' and 'bg.mean' do not match")
	}
	
	seq.mean = seq.sd = matrix(0, nrow=nrow(scores), ncol=ncol(scores))

	slen = matrix(seq.len, nrow=nrow(scores), ncol=ncol(scores)) - 
		   matrix(pwm.len, nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE) + 1

	# rows (i) - sequences
	# cols (j) - motifs
	if(is.matrix(bg.mean)){
		# human implementation ... 
		
		bg.len.min = matrix(bg.len[1,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
		bg.len.max = matrix(bg.len[nrow(bg.len),], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
	
		bg.len.all = lapply(1:nrow(bg.len), function(i) matrix(bg.len[i,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))
	
		# split into three classes
		min.inx = slen <= bg.len.min
		max.inx = slen >= bg.len.max
		int.inx = !min.inx & !max.inx

		# bg.sd correction factor value
		bgsd = bg.sd * sqrt(bg.len/matrix(bg.len[1,], nrow=nrow(bg.len), ncol=ncol(bg.len), byrow=TRUE))
		bgsd = bgsd / matrix(bgsd[1,], nrow=nrow(bgsd), ncol=ncol(bgsd), byrow=TRUE)
	
		bgsd.all = lapply(1:nrow(bgsd), function(i) matrix(bgsd[i,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))

		# smaller
		if(any(min.inx)){
			bg.mean.min = matrix(bg.mean[1,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
			bg.sd.min = matrix(bg.sd[1,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)

			# keep mean, use quadratic for sd
			seq.mean[min.inx] = bg.mean.min[min.inx]
			seq.sd[min.inx] = bg.sd.min[min.inx] / sqrt(slen[min.inx]/bg.len.min[min.inx])
		} 
		if(any(max.inx)){
			bg.mean.max = matrix(bg.mean[nrow(bg.mean),], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
			bg.sd.max = matrix(bg.sd[nrow(bg.sd),], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)

			# keep mean, use quadratic for sd
			seq.mean[max.inx] = bg.mean.max[max.inx]
			seq.sd[max.inx] = bg.sd.max[max.inx] / sqrt(slen[max.inx]/bg.len.max[max.inx])
		}
		if(any(int.inx)){
			bg.mean.all = lapply(1:nrow(bg.mean), function(i) matrix(bg.mean[i,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))
			bg.sd.all = lapply(1:nrow(bg.sd), function(i) matrix(bg.sd[i,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))
			# interpolate!		
			for(k in 1:(nrow(bg.mean)-1)){
				a = bg.len.all[[k]]
				b = bg.len.all[[k+1]]
				inx = int.inx & slen >= a & slen < b
				if(any(inx)){
					# direct linear interpolation for mean
					seq.mean[inx] = bg.mean.all[[k]][inx] + (bg.mean.all[[k+1]][inx] - bg.mean.all[[k]][inx])/(b[inx]-a[inx]) * (slen[inx]-a[inx])
				
					# base value of seq.sd
					seq.sd[inx] = bg.sd.all[[1]][inx] / sqrt(slen[inx]/bg.len.all[[1]][inx])
				
					# this correction factor would ideally be 1
					seq.sd[inx] = seq.sd[inx] * (bgsd.all[[k]][inx] + (bgsd.all[[k+1]][inx]-bgsd.all[[k]][inx])/(b[inx]-a[inx]) * (slen[inx]-a[inx]))
				
				}
			}
		}
	} else {
		# default implementation
		seq.mean = matrix(bg.mean, nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
		seq.sd = matrix(bg.sd, nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE) / 
					sqrt(slen / matrix(bg.len, nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))
	}
	
	# calculate p-values
	seq.p = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	colnames(seq.p) = colnames(scores)
	rownames(seq.p) = rownames(scores)
	
	# calculate the mean/sd parameters of the lognormal distribution
	seq.ml = 2*log(seq.mean) - 0.5*log(seq.mean^2+seq.sd^2)
	seq.sl = sqrt(-2*log(seq.mean) + log(seq.mean^2+seq.sd^2))
	
	# convert to z-scores so we can streamline... 
	z = (log(scores) - seq.ml) / seq.sl
	seq.p[,] = plnorm(exp(z), meanlog=0, sdlog=1, lower.tail=FALSE, log.p=log)
	
	# convert to log10 P-values
	if(log)
		seq.p = seq.p * log10(exp(1))
		
	seq.p
}

#' Lognormal P-value for a set of sequences
#'
#' @param scores a matrix of per-sequence affinity scores
#' @param seq.len lengths of sequences
#' @param pwm.len lengths of pwms
#' @param bg.mean mean background at length of bg.len
#' @param bg.sd standard deviation of background at length of bg.len
#' @param bg.len the length for which mean and sd are calculated
#' @return P-value
logNormPvalSequenceSet = function(scores, seq.len, pwm.len, bg.mean, bg.sd, bg.len){
	# total score for all the sequences
	s = structure(rep(0, ncol(scores)), names=colnames(scores))
	res = structure(rep(0, ncol(scores)), names=colnames(scores))
	for(i in 1:ncol(scores)){
		# lengths of sequences for this pwm, all of them are shorter by pwm.len-1
		seq.len.pwm = seq.len - pwm.len[i] + 1
		
		# sum over sequences then average by total length
		s[i] = sum(scores[,i] * seq.len.pwm) / sum(seq.len.pwm)
		if(is.matrix(bg.mean)){
			res[i] = logNormPval(s[i], sum(seq.len.pwm)+pwm.len[i]-1, pwm.len[i], bg.mean[,i,drop=FALSE], bg.sd[,i,drop=FALSE], bg.len[,i,drop=FALSE])
		} else {
			res[i] = logNormPval(s[i], sum(seq.len.pwm)+pwm.len[i]-1, pwm.len[i], bg.mean[i], bg.sd[i], bg.len[i])
		}
	}
	
 	return(res)
}


#' Replace all infinite values by 0
#'
#' @param x a vector of values
keepFinite = function(x) {
	if(any(!is.finite(x)))
		x[!is.finite(x)] = 0
	x
}

#' Z-score calculation for cutoff hits
#'
#' The Z-score is calculated separately for each sequence
#'
#' @param scores the hit counts for the sequences
#' @param seq.len the length distribution of sequences
#' @param pwm.len the length distribution of the PWMs
#' @param bg.P background probabilities of observing a motif hit at nucleotide resolution 
#'             (scaled to sequence length, not 2 * length)
#' @return Z-score
cutoffZscore = function(scores, seq.len, pwm.len, bg.P){
	if(is.vector(scores))
		scores = matrix(scores, nrow=1)

	# actual probability is 1/2 because we have two strands
	bg.P = bg.P / 2
	
	z = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	rownames(z) = rownames(scores)
	colnames(z) = colnames(scores)
	for(i in 1:nrow(scores)){
		# actualy length is 2x because we have two strands
		P.n = 2 * (seq.len[i] - pwm.len + 1)
		P.mean = bg.P * P.n
		P.sd = sqrt(P.n * bg.P * (1-bg.P))
		# so this would be the poisson approximation
		# z[i,] = 1 / ppois(scores[i,], P.mean, lower.tail=FALSE)
		z[i,] = keepFinite( (scores[i,] - P.mean) / P.sd )
	}
	
	return(z)
}

#' Z-score calculation for cutoff hits for group of sequences
#'
#' The Z-score is calculated as if the sequence came for one very long sequence
#'
#' @param scores the hit counts for the sequences
#' @param seq.len the length distribution of sequences
#' @param pwm.len the length distribution of the PWMs
#' @param bg.P background probabilities of observing a motif hit at nucleotide resolution
#' @return Z-score
cutoffZscoreSequenceSet = function(scores, seq.len, pwm.len, bg.P){	
	total.hits = colSums(scores)
	total.len = sum(seq.len) - length(seq.len)*(pwm.len-1) + pwm.len - 1
	
	# do it seperate for each PWM to make sure the lengths are right
	res = sapply(1:length(total.hits), function(i) cutoffZscore(total.hits[i], total.len[i], pwm.len[i], bg.P[i]))
	names(res) = names(total.hits)
	
	return(res)
}

#' Calculate the empirical P-value by affinity of cutoff. 
#'
#' This is the new backend function for empirical P-values for either affinity or cutoff. 
#' The function only works on single sequences. 
#'
#' @param scores the scores obtained for the sequence
#' @param seq.len the length of the sequence, if a single value will take a single sequence
#'                of given length. If a vector of values, will take sequences of given lengths
#'                and joint them together
#' @param pwm.len the lengths of PWMs
#' @param bg.fwd raw odds scores for the forward strand of background
#' @param bg.rev raw odds scores for the reverse strand of background
#' @param cutoff if not NULL, will use hit count above this cutoff. The cutoff should be specified in log2. 
#' @param B the number of random replicates
#' @param verbose if to give verbose progress reports
#' @param exact.length if to take into consideration that the actual sequence lengths differ for different PWMs.
#'                     For very long sequences (i.e. seq.len >> pwm.len) this make very little difference, however
#'                     the run time with exact.length is much longer. 
empiricalPvalue = function(scores, seq.len, pwm.len, bg.fwd, bg.rev, cutoff=NULL, B=10000, verbose=FALSE, exact.length=FALSE){
	# length of sequences
	bg.len = nrow(bg.fwd)
	
	if(max(seq.len) > bg.len){
		stop("The length of sequence greater than background!")
	} 
	
	if(max(seq.len) * B > bg.len){
		warning("The length of the sequence multiplied by number of runs (B) is greater than background length. This might lead to unreliable P-value estimates.")
	}
		
	# the empricial p-value
	score.p = rep(0, length(scores))
		
	for(i in 1:B){
		if(verbose)
			message("Random sample ", i, " / ", B)
		
		# calculate the final score as sum over different seq.len sequences
		final.score = 0
		for(k in 1:length(seq.len)){
			# choose a subsequence with matching length
			start.range = 1:(bg.len - seq.len[k])
			start = sample(start.range, 1)
		
			# fetch the scores from the two strands
			score.fwd = bg.fwd[start:(start+seq.len[k]-1),]
			score.rev = bg.rev[start:(start+seq.len[k]-1),]

			# simulate that we don't have scores for last pwm.len+1 nucleotides
			if(exact.length){
				for(j in 1:length(pwm.len)){
					score.fwd[(nrow(score.fwd)-pwm.len[j]+1):nrow(score.fwd),j] = NA
					score.rev[(nrow(score.rev)-pwm.len[j]+1):nrow(score.rev),j] = NA
				}
			}
			
			# append both strands to one long string		
			score.both = rbind(score.fwd, score.rev)
		
			if(is.null(cutoff)){
				# affinity algorithm
				new.score = colSums(score.both, na.rm=TRUE)
			} else {
				# number of hits above threshold
				new.score = colSums(score.both >= 2^cutoff, na.rm=TRUE)
			}
			final.score = final.score + new.score
		}
		
		# to obtain the mean we divide by twice the length (because it's both strands)
		final.score = final.score / (2 * sum(seq.len)) 
		
		score.p = score.p + ((final.score > scores) + 0)
		
	}
	
	return(score.p / B)
}

#' Empirical P-value for a set of sequences
#'
#' Calculate empirical P-value for a set of sequences, using either affinity or cutoff. When cutoff is used, the score
#' is a number of motif hits above a certain log-odds cutoff. 
#'
#' @param scores a matrix of scores, rows for sequences, columns for PWMs
#' @param seq.len the lengths of sequences
#' @param pwm.len the lengths of PWMs
#' @param bg.fwd raw odds scores for the forward strand of background
#' @param bg.rev raw odds scores for the reverse strand of background
#' @param cutoff if not NULL, will use hit count above this cutoff. The cutoff should be specified in log2. 
#' @param B the number of random replicates
#' @param verbose if to give verbose progress reports
empiricalPvalueSequenceSet = function(scores, seq.len, pwm.len, bg.fwd, bg.rev, cutoff=NULL, B=10000, verbose=FALSE){
	# affinity
	if(is.null(cutoff)){
		# total score for all the sequences
		s = structure(rep(0, ncol(scores)), names=colnames(scores))
		res = structure(rep(0, ncol(scores)), names=colnames(scores))
		exact.lengths = structure(rep(0, ncol(scores)), names=colnames(scores))
		for(i in 1:ncol(scores)){
			# lengths of sequences for this pwm, all of them are shorter by pwm.len-1
			seq.len.pwm = seq.len - pwm.len[i] + 1
			exact.lengths[i] = sum( seq.len.pwm )
		
			# sum over sequences then average by total length
			s[i] = sum(scores[,i] * seq.len.pwm) / sum(seq.len.pwm)			
		}
		
		# approximate sequence length without all the last positions being matched
		total.len = seq.len - round(mean(pwm.len))
		
		# run it on mean sequence length
		res = empiricalPvalue(s, total.len, pwm.len, bg.fwd, bg.rev, cutoff, B, verbose, exact.length=FALSE)
	
	 	return(res)
	} else {
		total.hits = colSums(scores)
		total.len = seq.len - round(mean(pwm.len)) #sum(seq.len) - length(seq.len)*pwm.len
	
		# run ti on mean sequence lengths (with pwm.len)
		res = empiricalPvalue(total.hits, total.len, pwm.len, 
					bg.fwd, bg.rev, cutoff, B, verbose, exact.length=FALSE)
					
		return(res)
	}
}

#' Apply GEV background normalization per every sequence
#' 
#' @param scores affinity scores for the PWMs, can contain scores for more than one sequence (as rows), P-values are extracted separately
#' @param seq.len the length distribution of the sequences
#' @param pwm.len the lengths of PWMs
#' @param bg.loc list of linear regression for location parameter
#' @param bg.scale list of linear regression for scale parameter
#' @param bg.shape list of linear regression for shape parameter
gevPerSequence = function(scores, seq.len, pwm.len, bg.loc, bg.scale, bg.shape){
	if(is.vector(scores)){
		scores = matrix(scores, nrow=1, dimnames=list(NULL, names(scores)))
	}
	if( length(seq.len) != nrow(scores) ){
		stop("The sequence length distribution and number of rows for scores doesn't match")
	}
	
	# calculate the expected sd values for the length distribution for each sequence
	seq.sd = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	
	log.seqlen = data.frame("log.len"=log(seq.len))
	
	# predict loc, scale, shape for each PWM
	loc = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	scale = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	shape = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	for(i in 1:ncol(scores)){
		loc[,i] = as.vector(predict.lm(bg.loc[[i]], newdata=log.seqlen))
		scale[,i] = as.vector(predict.lm(bg.scale[[i]], newdata=log.seqlen))
		shape[,i] = as.vector(predict.lm(bg.shape[[i]], newdata=log.seqlen))
	}
	
	## work out the P-values
	seq.p = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	colnames(seq.p) = colnames(scores)
	rownames(seq.p) = rownames(scores)
	
	for(i in 1:nrow(seq.p)){
		for(j in 1:ncol(seq.p)){
			# if scale is <0 the GEV approximation is no longer valid
			if(scale[i,j] < 0)
				seq.p[i,j] = NA
			else
				seq.p[i,j] = pgev(log(scores[i,j]), loc=loc[i,j], scale=scale[i,j], shape=shape[i,j], lower.tail=FALSE)
		}
	}
	
	seq.p
}

#' Calculate total affinity over a set of sequences
#'
#' @param scores affinity scores for individual sequences
#' @param seq.len lengths of sequences
#' @param pwm.len lengths of PWMs
affinitySequenceSet = function(scores, seq.len, pwm.len){
	if(is.vector(scores))
		scores = matrix(scores, nrow=1, dimnames=list(NULL, names(scores)))

	# final scores for the group
	final = structure(rep(0, ncol(scores)), names=colnames(scores))
	
	for(i in 1:ncol(scores)){
		# actual length of scanned sequence
		seq.len.pwm = seq.len - pwm.len[i] + 1
		
		final[i] = sum(scores[,i] * seq.len.pwm) / sum(seq.len.pwm)
	}
	
	return(final)
}

#' Calculate Recovery-AUC for motifs ranked according to some scoring scheme
#'
#' Note that this function asssumes that smaller values are better!
#' 
#' @param seq.res a matrix where each column represents a PWM and each row a result for a different sequence. 
motifRecoveryAUC = function(seq.res){
	# calculate the ranks
	r = t(apply(seq.res, 1, rank))
	
	# previous point needed to calculate AUC-ROC
	auc = rep(0, ncol(r))
	names(auc) = colnames(r)
	
	# do it for all whole steps
	rec = t(sapply(1:ncol(r), function(i) colSums(r<=i)))
	
	list("rec"=rec, "auc"=colSums(rec))
}

#' Calculate PR-AUC for motifs ranked according to some scoring scheme
#'
#' Note that this function asssumes that smaller values are better!
#' 
#' @param seq.res a matrix where each column represents a PWM and each row a result for a different sequence. 
motifPrAUC = function(seq.res){
	# calculate the ranks
	r = t(apply(seq.res, 1, rank))
	
	# previous point needed to calculate AUC-ROC
	auc = rep(0, ncol(r))
	names(auc) = colnames(r)
	
	# do it for all whole steps
	rec = t(sapply(1:ncol(r), function(i) colSums(r<=i)))
	
	recall = rec / nrow(r)
	prec = recall / 1:nrow(rec)
	
	list("prec"=prec, "recall"=recall)
}