R/pwm.R

Defines functions motifPrAUC motifRecoveryAUC affinitySequenceSet gevPerSequence empiricalPvalueSequenceSet empiricalPvalue cutoffZscoreSequenceSet cutoffZscore keepFinite logNormPvalSequenceSet logNormPval matrixShuffleZscorePerSequence motifEnrichment motifIC motifScoresBigMemory motifScores .inputParamSequences .inputParamMotifs toPWM PFMtoPWM scanWithPWM PWMUnscaled .normargPfm .normargPriorParams

Documented in affinitySequenceSet cutoffZscore cutoffZscoreSequenceSet empiricalPvalue empiricalPvalueSequenceSet gevPerSequence .inputParamMotifs .inputParamSequences keepFinite logNormPval logNormPvalSequenceSet matrixShuffleZscorePerSequence motifEnrichment motifIC motifPrAUC motifRecoveryAUC motifScores motifScoresBigMemory .normargPfm .normargPriorParams PFMtoPWM PWMUnscaled scanWithPWM toPWM

# Globals from Biostrings
DNA_BASES = c("A", "C", "G", "T")
DNA_ALPHABET = c("A", "C", "G", "T", "M", "R", "W", "S", "Y", "K", "V", "H", "D", "B", "N", "-", "+")

#' Input parameter normalization function for PWMUnscaled
#'
#' This function is from Biostrings package
#'
#' @param prior.params Typical 'prior.params' vector: c(A=0.25, C=0.25, G=0.25, T=0.25)
.normargPriorParams <- function(prior.params)
{
    if (!is.numeric(prior.params))
        stop("'prior.params' must be a numeric vector")
    if (length(prior.params) != length(DNA_BASES) ||
        !setequal(names(prior.params), DNA_BASES))
        stop("'prior.params' elements must be named A, C, G and T")
    ## Re-order the elements.
    prior.params <- prior.params[DNA_BASES]
    if (any(is.na(prior.params)) || any(prior.params < 0))
        stop("'prior.params' contains NAs and/or negative values")
    prior.params
}

#' Input parameter normalization for PWMUnscaled
#'
#' This function is from Biostrings package.
#' A Position Frequency Matrix (PFM) is also represented as an ordinary
#' matrix. Unlike a PWM, it must be of type integer (it will typically be
#' the result of consensusMatrix()).
#'
#' @param x a frequency matrix
.normargPfm <- function(x)
{
    if (!is.matrix(x) || !is.integer(x))
        stop("invalid PFM 'x': not an integer matrix")
    ## Check the row names.
    if (is.null(rownames(x)))
        stop("invalid PFM 'x': no row names")
    if (!all(rownames(x) %in% DNA_ALPHABET))
        stop("invalid PFM 'x': row names must be in 'DNA_ALPHABET'")
    if (!all(DNA_BASES %in% rownames(x)))
        stop("invalid PFM 'x': row names must contain A, C, G and T")
    if (any(duplicated(rownames(x))))
        stop("invalid PFM 'x': duplicated row names")
    ## Check the nb of cols.
    if (ncol(x) == 0L)
        stop("invalid PFM 'x': no columns")
    ## Check the values.
    if (any(is.na(x)) || any(x < 0L))
        stop("invalid PFM 'x': values cannot be NA or negative")
    if (any(x[!(rownames(x) %in% DNA_BASES), ] != 0L))
        stop("invalid PFM 'x': IUPAC ambiguity letters are represented")
    x <- x[DNA_BASES, , drop=FALSE]
    x
}

#' The PWM function from Biostrings without unit scaling
#'
#' By default the Biostrings package scales the log-odds score so it is within 0 and 1. In this function
#' we take a more traditional approach with no unit scaling and offer unit scaling as an additional parameter.
#'
#' See ?PWM from Biostrings for more information on input arguments. 
#'
#' @title Create a PWM from PFM
#' @param x the integer count matrix representing the motif, rows as nucleotides
#' @param id a systematic ID given to this PWM, could include the source, version, etc
#' @param name the name of the transcription factor (TF) to which the PWM corresponds to
#' @param type the type of PWM calculation, either as log2-odds, or posterior probability (frequency matrix)
#' @param prior.params the pseudocounts for each of the nucleotides 
#' @param pseudo.count the pseudo-count values if different from priors
#' @param unit.scale if to unit.scale the pwm (default is no unit scaling)
#' @param seq.count if x is a normalised PFM (i.e. with probabilities instead of sequence counts), then this sequence count
#'                  will be used to convert \code{x} into a count matrix
#' @return a new PWM object representing the PWM 
#' @export
#' @examples
#'
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    ttk = MotifDb.Dmel.PFM[["ttk"]]
#'    
#'    # make a PWM with uniform background
#'    PWMUnscaled(ttk, id="ttk-JASPAR", name="ttk")
#'    
#'    # custom background
#'    PWMUnscaled(ttk, id="ttk-JASPAR", name="ttk", 
#'      prior.params=c("A"= 0.2, "C" = 0.3, "G" = 0.3, "T" = 0.2))
#'
#'    # get background for drosophila (quick mode on a reduced dataset)
#'    prior = getBackgroundFrequencies("dm3", quick=TRUE)
#'    
#'    # convert using genomic background
#'    PWMUnscaled(ttk, id="ttk-JASPAR", name="ttk", prior.params=prior)
#' }
#'
PWMUnscaled = function(x, id="", name="", type=c("log2probratio", "prob"), prior.params=c(A=0.25, C=0.25, G=0.25, T=0.25), pseudo.count=prior.params, 
	unit.scale=FALSE, seq.count=NULL){
	
	# convert to PFM if needed
	if(!is.null(seq.count)){
		x = apply(round(x * seq.count), 1:2, as.integer)
	}
	
	# match input params
    x <- .normargPfm(x)
    nseq <- colSums(x)
    type <- match.arg(type)
    prior.params <- .normargPriorParams(prior.params)
    
    # total prior added
    priorN <- sum(prior.params)
    
    # posterior probabilities with unequal number of counts per row
    postProbs <- divideRows(x + prior.params, nseq + priorN)
    rownames(postProbs) = rownames(x)
    colnames(postProbs) = colnames(x)
    
    # calculate logodds, or return probabilities
    if (type == "log2probratio") {
        if (any(prior.params == 0)) 
            stop("infinite values in PWM due to 0's in 'prior.params'")
        prior.probs <- prior.params/priorN
        ans <- log2(postProbs/prior.probs)
    }
    else {
        ans <- postProbs
    }
    
    # if to apply unit.scale
    if(unit.scale)
	   	ans = unitScale(ans)
	   	
	# return an object 
	new("PWM", id=id, name=name, pfm=x, prior.params=prior.params, pwm=ans)
}

#' Scan the whole sequence on both strands
#'
#' The whole sequence is scanned with a PWM and scores returned beginning at each position. Partial motif
#' matches are not done, thus the last #[length of motif]-1 scores are NA. 
#' 
#' The function returns either an odds average (*not* log-odds average), maximal score on each strand,
#' or scores on both strands. 
#'
#' The function by default returns the score in log2 following the package \code{Biostrings}. 
#'
#' @param pwm PWM object
#' @param dna a DNAString or other sequence from Biostrings
#' @param pwm.rev the reverse complement for a pwm (if it is already pre-computed)
#' @param odds.score if to return raw scores in odds (not logodds) space
#' @param both.strands if to return results on both strands
#' @param strand.fun which function to use to summarise values over two strands (default is "mean")
#' @return a vector representing scores starting at each position, or a matrix with score in the two strands
#' @export
#' @examples
#' 
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    ttk = MotifDb.Dmel[["ttk"]]
#'
#'    # odds average over the two strands expressed as log2-odds
#'    scanWithPWM(ttk, DNAString("CGTAGGATAAAGTAACT"))
#'    
#'    # log2-odds scores on both strands
#'    scanWithPWM(ttk, DNAString("CGTAGGATAAAGTAACT"), both.strands=TRUE)
#' }
#' 
scanWithPWM = function(pwm, dna, pwm.rev=NULL, odds.score=FALSE, both.strands=FALSE, strand.fun="mean"){
	if(is.character(dna))
		dna = DNAString(dna)

	if(!inherits(dna, c("DNAString", "DNAStringSet")))
		stop("The input sequence needs to be either of type DNAString or DNAStringSet")
	

	if(is.null(pwm.rev))
		pwm.rev = reverseComplement(pwm)
		
	# extract only the PWM matrices
	pwm = pwm$pwm
	pwm.rev = pwm.rev$pwm
	
	if(length(dna) < ncol(pwm)){
		stop("DNA sequence needs to be at least as long as the PWM")
	}

	fwd.motif = PWMscoreStartingAt(pwm, dna, starting.at=1:(length(dna)-ncol(pwm)+1))
	back.motif = PWMscoreStartingAt(pwm.rev, dna, starting.at=1:(length(dna)-ncol(pwm)+1))
	
	if(both.strands){
		res = cbind(fwd.motif, back.motif)
		colnames(res) = c("fwd", "rev")
		res = rbind(res, matrix(NA, nrow=(ncol(pwm)-1), ncol=2))
	} else {
		if(strand.fun == "max"){
			res = apply(cbind(fwd.motif, back.motif), 1, max)
		} else if(strand.fun == "mean"){
			res = 2^cbind(fwd.motif, back.motif)
			res = log2(rowMeans(res))
		} else {
			stop("Unknown strand function", strand.fun, ". Valid values are: mean, max")
		}
		# pad out the result so it is of same length as input sequence
		res = c(res, rep(NA, ncol(pwm)-1))
	}
	
	if(odds.score)
		return(2^res)
	else
		return(res)
}

#' Convert frequencies into motifs using PWMUnscaled
#'
#' Note that this function is deprecated and replaced by \code{toPWM()}. 
#'
#' @param motifs a list of motifs represented as matrices of frequencies (PFM)
#' @param id the set of IDs for the motifs (defaults to names of the 'motifs' list)
#' @param name the set of names for the motifs (defaults to names of the 'motifs' list)
#' @param seq.count if frequencies in the motifs are normalized to 1, provides a vector of sequence counts (e.g. for MotifDb motifs)
#' @param ... other parameters to PWMUnscaled
#'
#' @export
#' @examples
#' \dontrun{
#' if (requireNamespace("PWMEnrich.Dmelanogaster.background")) {
#'   data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'   # convert to PWM with uniform background
#'   PFMtoPWM(MotifDb.Dmel.PFM)
#'
#'   # get background for drosophila (quick mode on a reduced dataset)
#'   prior = getBackgroundFrequencies("dm3", quick=TRUE)
#'    
#'   # convert with genomic background 
#'   PFMtoPWM(MotifDb.Dmel.PFM, prior.params=prior)
#' }
#' }
PFMtoPWM = function(motifs, id=names(motifs), name=names(motifs), seq.count=NULL, ...){
	if(!is.list(motifs)){
		was.list = FALSE
		motifs = list(motifs)
	} else {
		was.list = TRUE
	}
	
	if(is.null(id))
		id = rep("", length(motifs))
	if(is.null(name))
		name = rep("", length(motifs))
	
	if(length(id) != length(motifs))
		stop("The number of IDs (parameter 'id') need to be the same as number of motifs (parameter 'motifs')")

	if(length(name) != length(motifs))
		stop("The number of names (parameter 'name') need to be the same as number of motifs (parameter 'motifs')")
		
	if(!is.null(seq.count) && length(seq.count) != length(motifs)){
		stop("The 'seq.count' vector needs to be of the same length as the list of motifs (parameters 'motifs')")
	}
	
	# call PWMUnscaled
	res = list()
	for(i in 1:length(motifs)){
		if(is.null(seq.count)){
			res[[i]] = PWMUnscaled(motifs[[i]], id=id[i], name=name[i], ...)
		} else {
			res[[i]] = PWMUnscaled(motifs[[i]], id=id[i], name=name[i], seq.count=seq.count[i], ...)
		}
	}
	
	names(res) = names(motifs)
	
	# convert back to single object if that's how the input was
	if(!was.list){
		return(res[[1]])
	} else {
		return(res)
	}

}

#' Convert motifs into PWMs
#'
#' @param motifs a list of motifs either as position probability matrices (PPM) or frequency matirces (PFMs)
#' @param ids the set of IDs for the motifs (defaults to names of the 'motifs' list)
#' @param targets the set of target TF names for the motifs (defaults to names of the 'motifs' list)
#' @param seq.count provides a vector of sequence counts for probability matrices (PPMs). Default it 50. 
#' @param prior frequencies of the four letters in the genome. Default is uniform background.
#' @param ... other parameters to PWMUnscaled
#'
#' @export
#' @examples
#' \dontrun{
#' if (requireNamespace("PWMEnrich.Dmelanogaster.background")) {
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    toPWM(MotifDb.Dmel.PFM) # convert to PWM with uniform background
#'
#'    # get background for drosophila (quick mode on a reduced dataset)
#'    prior = getBackgroundFrequencies("dm3", quick=TRUE)
#'    toPWM(MotifDb.Dmel.PFM, prior=prior) # convert with genomic background 
#' }
#' }
toPWM = function(motifs, ids=names(motifs), targets=names(motifs), seq.count=50, prior=c(A=0.25, C=0.25, G=0.25, T=0.25), ...){
	if(!is.list(motifs)){
		was.list = FALSE
		motifs = list(motifs)
	} else {
		was.list = TRUE
	}
	
	# extend seq.count if needed
	if(length(seq.count) == 1 && length(seq.count) != length(motifs))
		seq.count = rep(seq.count, length(motifs))
	
	# convert each individual motif into integers or multiply with seq.count
	for(i in 1:length(motifs)){
		m = motifs[[i]]
		
		mi = apply(m, 1:2, as.integer)
		if(all(m == mi)){
			# can be coersced to an integer matrix, use that
			motifs[[i]] = mi
		} else {
			# if the values is NA use 50 as default
			if(is.na(seq.count[i])){
				motifs[[i]] = apply(round(m * 50), 1:2, as.integer)
			} else {
				motifs[[i]] = apply(round(m * seq.count[i]), 1:2, as.integer)
			}
		}
		
	}
	
	if(is.null(ids))
		ids = rep("", length(motifs))
	if(is.null(targets))
		targets = rep("", length(motifs))
	
	if(length(ids) != length(motifs))
		stop("The number of IDs (parameter 'ids') need to be the same as number of motifs (parameter 'motifs')")

	if(length(targets) != length(motifs))
		stop("The number of TF target names (parameter 'targets') need to be the same as number of motifs (parameter 'motifs')")
	
	# call PWMUnscaled
	res = list()
	for(i in 1:length(motifs)){
		res[[i]] = PWMUnscaled(motifs[[i]], id=ids[i], name=targets[i], prior.params=prior, ...)
	}
	
	names(res) = names(motifs)
	
	# convert back to single object if that's how the input was
	if(!was.list){
		return(res[[1]])
	} else {
		return(res)
	}

}

#' Normalizes the motifs input argument for multiple functions
#'
#' @param motifs a list of motifs either as frequency matrices (PFM) or as PWM objects. If PFMs are specified
#'               they are converted to PWMs using uniform background. 
.inputParamMotifs = function(motifs){
	# check motifs format and convert to PWM
	if(!is.list(motifs))
		motifs = list(motifs)
		
	if(is.matrix(motifs[[1]]))
		pwms = PFMtoPWM(motifs)
	else if(inherits(motifs[[1]], "PWM"))
		pwms = motifs
	else
		stop("motifs need to be either frequency matrices or PWM objects")
		
	return(pwms)
}

#' Normalize the sequences input argument
#'
#' @param sequences a set of sequences to be scanned, a list of DNAString or other scannable objects
.inputParamSequences = function(sequences){
	if(is.character(sequences)){
		sequences = readDNAStringSet(sequences)
	}
	# make sure sequences are in the right format
	if(!is.list(sequences) & !inherits(sequences, "DNAStringSet"))
		sequences = list(sequences)
	
	if(is.list(sequences) & length(sequences)>0){
		sequences = lapply(sequences, function(s){
			if(is.character(s))
				DNAString(s)
			else
				s
		})
	}
		
	#if(class(sequences) == "DNAStringSet")
	#	sequences = DNAStringSetToList(sequences)
		
	return(sequences)
}

#' Motif affinity or number of hits over a threshold
#'
#' Scan a number of sequences either to find overall affinity, or a number of hits over a score threshold. 
#'
#' @param sequences a set of sequences to be scanned, a list of DNAString or other scannable objects
#' @param motifs a list of motifs either as frequency matrices (PFM) or as PWM objects. If PFMs are specified
#'               they are converted to PWMs using uniform background. 
#' @param raw.scores if to return raw scores (odds) for each position in the sequence. Note that scores for forward and reverse 
#'                   strand are concatenated into a single long vector of scores (twice the length of the sequence)
#' @param verbose if to print verbose output
#' @param cutoff if not NULL, will count number of matches with score above value specified (instead of returning the average affinity). 
#'               Can either be one value, or a vector of values for each of the motifs. 
#' @return if raw.scores=FALSE, returns a matrix of mean scores (after cutoff if any), where columns are motifs. 
#'         The returned values are either mean odd scores (not log-odd), or number of hits above a threshold;
#'         otherwise if raw.scores=TRUE, returns a list of raw score values (before cutoff)
#' @export
#' @examples
#'
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # affinity scores
#'    affinity = motifScores(DNAString("CGTAGGATAAAGTAACTAGTTGATGATGAAAG"), MotifDb.Dmel)
#'    
#'    # motif hit count with Patser score of 4
#'    counts = motifScores(DNAString("CGTAGGATAAAGTAACTAGTTGATGATGAAAG"), MotifDb.Dmel, 
#'      cutoff=log2(exp(4)))
#'    
#'    print(affinity)
#'    print(counts)
#'
#'    # scanning multiple sequences
#'    sequences = list(DNAString("CGTAGGATAAAGTAACTAGTTGATGATGAAAG"), 
#'      DNAString("TGAGACGAAGGGGATGAGATGCGGAAGAGTGAAA"))
#'    affinity2 = motifScores(sequences, MotifDb.Dmel)
#'    print(affinity2)
#' }
motifScores = function(sequences, motifs, raw.scores=FALSE, verbose=TRUE, cutoff=NULL){	
	if(!is.null(.PWMEnrich.Options[["useBigMemory"]]) && .PWMEnrich.Options[["useBigMemory"]])
		return(motifScoresBigMemory(sequences, motifs, raw.scores, verbose, cutoff))

	# check motifs format and convert to PWM
	pwms = .inputParamMotifs(motifs)
	sequences = .inputParamSequences(sequences)
	
	# check if any of the sequences is all Ns, if so, produce an error
	if(is(sequences, "DNAStringSet")){
		al = alphabetFrequency(sequences)
	} else {
		al = do.call("rbind", lapply(sequences, alphabetFrequency))
	}
	
	# check if any of the sequences is all Ns
	all.n = which(al[,"N"] == rowSums(al))
	if(length(all.n) >0){
		stop(paste("Sequence with index(es):", paste(all.n, collapse=","), "only contain N's. Please remove this sequence as the PWM score cannot be computed for any position."))
	}
		
	pwms.rev = lapply(pwms, reverseComplement)
	
	# always unfold cutoff into a vector of values
	if(!is.null(cutoff)){
		if(length(cutoff) == 1){
			cutoff = rep(cutoff, length(pwms))
		} else if(length(cutoff) != length(pwms)) {
			stop("The lengths of cutoff and pwms do not match. Either provide one values for cutoff, or a vector of values, one for each PWM")
		} 
	}
	
	# inner loop function to use for parallel processing
	motifScoresLoop = function(i){
		if(verbose)
			message(paste("Scanning sequence", i, "/", length(sequences)))
		s = sequences[[i]]
		
		# if using raw scores, record matrix of values;
		# otherwise a single final value for each pwm
		if(raw.scores){
			r = matrix(0, nrow=length(s)*2, ncol=length(pwms))
			colnames(r) = names(pwms)
		} else {
			r = rep(0, length(pwms))
			names(r) = names(pwms)
		}
		
		for(j in 1:length(pwms)){
			# if we are interested in counts, use max over two strands
			
			if(raw.scores){
				# record raw scores that are averages over strands
				r[,j] = as.vector(scanWithPWM(pwms[[j]], s, pwms.rev[[j]], odds.score=TRUE, both.strands=TRUE))
			} else if(!is.null(cutoff)){
				# count number of hits on both strands
				r.pwm = scanWithPWM(pwms[[j]], s, pwms.rev[[j]], odds.score=FALSE, both.strands=TRUE)
				r[j] = sum(r.pwm >= cutoff[j], na.rm=TRUE)
			} else {
				# do average over both strands and also over all values
				r[j] = mean(scanWithPWM(pwms[[j]], s, pwms.rev[[j]], odds.score=TRUE), na.rm=TRUE)
			}
		}
		
		r
	}
	
	####################################### END OF INNER LOOP ################

	# either do it parallel or serial
	if(!is.null(.PWMEnrich.Options[["numCores"]])){
		cat("Parallel scanning with", .PWMEnrich.Options[["numCores"]], "cores\n")
		# do it in parallel
		res = parallel::mclapply(1:length(sequences), motifScoresLoop, mc.cores = .PWMEnrich.Options[["numCores"]])
		
		if(is.list(res)){
			if( any(sapply(res, is.null)) ){
				stop("Parallel scanning failed for some sequences. This could be due to a number of reasons including not enough memory.")
			}
		}
	} else {
		# do it serial
		res = lapply(1:length(sequences), motifScoresLoop)
	}
	
	## return either raw scores, or counts or means
	if(raw.scores){
		names(res) = names(sequences)
		return(res)
	} else {
		if(length(pwms) == 1){
			return( matrix(sapply(res, identity), ncol=1, dimnames=list(NULL, names(pwms))) )
		} else {
			r = t(sapply(res, identity))
			rownames(r) = names(sequences)
			colnames(r) = names(pwms)
			return( r )
		}
	}
}

#' This is a memory intensive version of motifScore() which is about 2 times faster
#'
#' The parameters and functionality are the same as \code{\link{motifScores}}. Please refer to documentation of this function
#' for detailed explanation of functionality. 
#'
#' This function is not meant to be called directly, but is indirectly called by motifScores() once a global parameters useBigMemory is set. 
#'
#' @param sequences set of input sequences
#' @param motifs set of input PWMs or PFMs
#' @param raw.scores if to return scores for each base-pair
#' @param verbose if to produce verbose output
#' @param cutoff the cutoff for calling binding sites (in base 2 log). 
#' @param seq.all already concatenated sequences if already available (used to internally speed up things)
#'
#' @seealso \code{\link{motifScores}}
motifScoresBigMemory = function(sequences, motifs, raw.scores=FALSE, verbose=TRUE, cutoff=NULL, seq.all=NULL){	
	# check motifs format and convert to PWM
	pwms = .inputParamMotifs(motifs)
	sequences = .inputParamSequences(sequences)
		
	pwms.rev = lapply(pwms, reverseComplement)
	pwms.len = sapply(pwms, length)
	min.pwm.len = min(pwms.len)
	
	# always unfold cutoff into a vector of values
	if(!is.null(cutoff)){
		if(length(cutoff) == 1){
			cutoff = rep(cutoff, length(pwms))
		} else if(length(cutoff) != length(pwms)) {
			stop("The lengths of 'cutoff' and 'pwms' do not match. Either provide one values for 'cutoff' or a vector of values, one for each PWM")
		} 
		
		# cutoff in odds space
		cutoff.2 = 2^cutoff
	}
	
	# work on a single sequence that is all concatenated together because it's faster
	if(is.null(seq.all))
		seq.all = DNAString(concatenateSequences(sequences))
	seq.len = sapply(sequences, length)
	
	# a common error with shorter sequences
	shorter.seq = which(seq.len < min.pwm.len)
	if(length(shorter.seq)>0){
		stop(paste(length(shorter.seq), "sequence(s) have length shorter than the shortest PWM (", min.pwm.len, "), please identify and remove these sequences."))
	}
	
	# a grouping data frame
	seq.len.sum = cumsum(seq.len)
	seq.group = data.frame("from"=rep(0, length(sequences)), "to"=0)
	seq.group$from = c(1, 1+seq.len.sum[-length(seq.len.sum)])
	seq.group$to = seq.len.sum
	
	# inner loop function to use for parallel processing
	#
	# @param motif.start the index of the first motif
	# @param motif.end the index of the last motif 
	motifScoresLoop = function(input.params){
		motif.start = input.params[[1]]
		motif.end = input.params[[2]]
	
		if(verbose){
			if(motif.start == motif.end){
				message(paste("Starting scanning with motif", names(pwms)[motif.start]))
			} else {
				message(paste("Starting scanning with motifs from", motif.start, "to", motif.end))
			}
		}
		# local copy of the sequences and motifs used in the function
		s = seq.all
		s.group = seq.group
		pwms.l = pwms[motif.start:motif.end]
		pwms.rev.l = pwms.rev[motif.start:motif.end]
		pwms.len.l = pwms.len[motif.start:motif.end]
		if(!is.null(cutoff))
			cutoff.2.l = cutoff.2[motif.start:motif.end]
		
		# create the output structure
		res = list()
		for(i in 1:nrow(s.group)){
			if(raw.scores){			
				r = matrix(0, nrow=seq.len[i]*2, ncol=length(pwms.l))
				colnames(r) = names(pwms.l)
				res[[i]] = r		
			} else {
				r = rep(0, length(pwms.l))
				names(r) = names(pwms.l)		
				res[[i]] = r
			}
		}
		
		### do the scanning, convert back to individual sequences and do the post-processing
		for(j in 1:length(pwms.l)){
			if(verbose){
				message(paste("Scanning all sequences with motif", j, "/", length(pwms.l)))
			}
			raw = as.vector(scanWithPWM(pwms.l[[j]], s, pwms.rev.l[[j]], odds.score=TRUE, both.strands=TRUE))
			
			### split the raw results by sequence
			for(i in 1:nrow(s.group)){
				# selector for this sequence
				from = seq.group$from[i]
				to = seq.group$to[i]
				from.rev = length(raw)/2 + seq.group$from[i]
				to.rev = length(raw)/2 + seq.group$to[i]
				sel = c(from:to, from.rev:to.rev)
			
				r = raw[sel]
				
				# fwd strand
				r[((seq.len[i]-pwms.len.l[j])+2):seq.len[i]] = NA
				# rev strand
				r[((2*seq.len[i]-pwms.len.l[j])+2):(2*seq.len[i])] = NA

				# save the values				
				if(raw.scores){
					res[[i]][,j] = r
				} else if(!is.null(cutoff)){
					# count number of hits on both strands 
					res[[i]][j] = sum(r >= cutoff.2.l[j], na.rm=TRUE)
				} else {
					# MeanAffinity
					res[[i]][j] = mean(r, na.rm=TRUE)
				}
				
			}
		}
				
		res
	}
	######################################## END OF INNER LOOP ######################################

	# either do it parallel or serial
	if(!is.null(.PWMEnrich.Options[["numCores"]]) && length(pwms) >= .PWMEnrich.Options[["numCores"]]){
		cores = .PWMEnrich.Options[["numCores"]]
		sel = round(seq(0, length(pwms), length.out=cores+1))
		start = sel[1:cores]+1
		end = sel[2:(cores+1)]
		input = lapply(1:cores, function(i) list(start[i], end[i]))
		# do it in parallel
		res.parallel = parallel::mclapply(input, motifScoresLoop, mc.cores = cores)
		
		if(is.list(res.parallel)){
			if( any(sapply(res.parallel, is.null)) ){
				stop("Parallel scanning failed for some sequences. This could be due to a number of reasons including not enough memory.")
			}
		}	

		# concatenate results for different motifs		
		if(raw.scores)
			res = lapply(1:length(sequences), function(i) do.call("cbind", lapply(res.parallel, function(x) x[[i]])))	
		else
			res = lapply(1:length(sequences), function(i) do.call("c", lapply(res.parallel, function(x) x[[i]])))
	} else {
		# do it serial
		res = motifScoresLoop(list(1, length(pwms)))
	}
	
	
	
	## return either raw scores, or counts or means
	if(raw.scores){
		names(res) = names(sequences)
		return(res)
	} else {
		if(length(pwms) == 1){
			return( matrix(sapply(res, identity), ncol=1, dimnames=list(names(sequences), names(pwms))) )
		} else {
			r = t(sapply(res, identity))
			rownames(r) = names(sequences)
			colnames(r) = names(pwms)
			return( r )
		}
	}
}


#' Information content for a PWM or PFM
#'
#' @param motif a matrix of frequencies, or a PWM object
#' @param prior.params the prior parameters to use when a matrix is given (ignored if motif is already a PWM)
#' @param bycol if to return values separately for each column
#' @return information content in bits (i.e. log2)
#' @export
#' @examples
#'
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'    data(MotifDb.Dmel.PFM, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # the nucleotide distribution is taken from the PWM (in this case genomic background)
#'    motifIC(MotifDb.Dmel[["ttk"]]) 
#'    # information content with default uniform background because the input is a matrix, 
#'    # not PWM object
#'    motifIC(MotifDb.Dmel.PFM[["ttk"]]) 
#' }
motifIC = function(motif, prior.params=c(A=0.25, C=0.25, G=0.25, T=0.25), bycol=FALSE){
	if(inherits(motif, "PWM")) {
		p = PWMUnscaled(motif$pfm, type="prob", prior.params=motif$prior.params)
		bg = motif$prior.params
	} else {
		p = PWMUnscaled(motif, type="prob", prior.params=prior.params)
		bg = prior.params
	}
		
	bg = bg/sum(bg) # normalize to 1
	
	cols = p$pwm * log2(p$pwm/bg)
	if(bycol)
		return(colSums(cols))
	else
		return(sum(cols))
}

#' Calculate motif enrichment using one of available scoring algorithms and background corrections. 
#'
#' This function provides and interface to all algorithms available in PWMEnrich
#' to find motif enrichment in a single or a group of sequences with/without
#' background correction. 
#'
#' Since for all algorithms the first step involves calculating raw scores without background correction, the output
#' always contains the scores without background correction together with (optional) background-corrected
#' scores. 
#'
#' Unless otherwise specified the scores are returned both separately for each sequence (without/with background) and
#' for the whole group of sequences (without/with background). 
#'
#' To use a background correction you need to supply a set of PWMs with precompiled background distribution parameters
#' (see function \code{\link{makeBackground}}). When such an object is supplied as the \code{pwm} parameter, the scoring 
#' scheme and background correction are automatically determined. 
#'
#' There are additional packages with already pre-computed background (e.g. see package \code{PWMEnrich.Dmelanogaster.background}).
#'
#' Please refer to (Stojnic & Adryan, 2012) for more details on the algorithms. 
#'
#' @title Motif enrichment
#' @param sequences the sequences to be scanned for enrichment. Can be either a single sequence 
#'        (an object of class DNAString), or a list of DNAString objects, or a DNAStringSet object.
#' @param pwms this parameter can take multiple values depending on the scoring scheme and background correction used.
#'             When the \code{method} parameter is set to "autodetect", the following default algorithms
#'             are going to be used: 
#'        \itemize{
#'           \item if \code{pwms} is a list containing either frequency matrices or a list of PWM objects then
#'                 the "affinity" algorithm is selected. If frequency matrices are given, they are converted 
#'                 to PWMs using uniform background. For best performance, convert frequency matrices to PWMs
#'                 before calling this function using realistic genomic background. 
#'           \item Otherwise, appropriate scoring scheme and background correction are selected based on the
#'                 class of the object (see below). 
#'        }
#' @param score this parameter determines which scoring scheme to use. Following scheme as available:
#'        \itemize{
#'           \item "autodetect" - default value. Scoring method is determined based
#'                 on the type of \code{pwms} parameter. 
#'           \item "affinity" - use threshold-free affinity score. The \code{pwms}
#'                 parameter can either be a list of frequency matrices, \code{PWM} objects, or a 
#'                 \code{PWMLognBackground} object. 
#'           \item "cutoff" - use number of motif hits above a score cutoff. The \code{pwms}
#'                 parameter can either be a list of frequency matrices, \code{PWM} objects, or a 
#'                 \code{PWMCutoffBackground} object.
#'           \item "clover" - use the Clover algorithm (Frith et al, 2004). The Clover score of a single 
#'                 sequence is identical to the affinity score, while for a group of sequences is an
#'                 average of products of affinities over all sequence subsets. 
#'         }
#'            
#' @param bg this parameter determines how the raw score is compared to the background distribution. 
#'        \itemize{
#'           \item "autodetect" - default value. Background correction is determined based on the type
#'                  of the \code{pwms} parameter. 
#'           \item "logn" - use a lognormal distribution background pre-computed for a set of PWMs.
#'                 This requires \code{pwms} to be of class \code{PWMLognBackground}.
#'           \item "z" - use a z-score for the number of significant motif hits compared to background number of hits.
#'                 This requires \code{pwms} to be of class \code{PWMCutoffBackground}.
#'           \item "pval" - use empirical P-value based on a set of background sequences. This requires
#'                 \code{pwms} to be of class \code{PWMEmpiricalBackground}. Note that PWMEmpiricalBackground
#'                 objects tend to be very large so that the empirical P-value can be calculated in reasonable time.
#'           \item "ms" - shuffle columns of motif matrices and use that as basis for P-value calculation. Note that
#'                  since the sequences need to rescanned with all of the new shuffled motifs this can be very slow. 
#'                  Also, this also works only no *individual* sequences, not groups. 
#'           \item "none" - no background correction
#'         }
#' @param cutoff the score cutoff for a significant motif hit if scoring scheme "cutoff" is selected. 
#' @param verbose if to print verbose output
#' @param motif.shuffles number of times to shuffle motifs if using "ms" background correction
#' @param B number of replicates when calculating empirical P-value
#' @param group.only if to return statistics only for the group of sequences, not individual sequences. In the case of
#'                   empirical background the P-values for individual sequences are not calculated (thus saving time), for other 
#'                   backgrounds they are calculated but not returned.  
#' @return a MotifEnrichmentResults object containing a subset following elements:
#'         \itemize{
#'           \item "score" - scoring scheme used
#'           \item "bg" - background correction used
#'           \item "params" - any additional parameters
#'			 \item "sequences" - the set of sequences used
#'           \item "pwms" - the set of pwms used
#'           \item "sequence.nobg" - per-sequence scores without any background correction. 
#'                   For "affinity" and "clover" a matrix of mean affinity scores; for
#'                   "cutoff" number of significant hits above a cutoff 
#'           \item "sequence.bg" - per-sequence scores after background correction. For "logn" and "pval" the P-value (smaller is better);
#'                                 for "z" and "ms" background corrections the z-scores (bigger is better). 
#'           \item "group.nobg" - aggregate scores for the whole group of sequences without background correction. For "affinity"
#'                                and "clover" the mean affinity over all sequences in the set; for "cutoff" the total number of hits in all
#'                                sequences.
#'           \item "group.bg" - aggregate scores for the whole group of sequences with background correction. For "logn" and "pval",
#'                              the P-value for the whole group (smaller is better); for "z" and "ms" the z-score for the whole set (bigger is better).
#'           \item "sequence.norm" - (only for "logn") the length-normalized scores for each of the sequences. Currently only implemented
#'                                   for "logn", where it returns the values normalized from LogN(0,1) distribution
#'           \item "group.norm" - (only for "logn") similar to sequence.norm, but for the whole group of sequences
#'         }
#' @export
#' @references \itemize{ 
#'  \item R. Stojnic & B. Adryan: Identification of functional DNA motifs using a binding affinity lognormal background distribution, submitted.
#' 	\item MC Frith et al: Detection of functional DNA motifs via statistical over-representation, Nucleid Acid Research (2004).
#' }
#' @examples 
#'
#' if(requireNamespace("PWMEnrich.Dmelanogaster.background")){
#'    ###
#'    # load the pre-compiled lognormal background
#'    data(PWMLogn.dm3.MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    # scan two sequences for motif enrichment
#'    sequences = list(DNAString("GAAGTATCAAGTGACCAGTAGATTGAAGTAGACCAGTC"), 
#'      DNAString("AGGTAGATAGAACAGTAGGCAATGGGGGAAATTGAGAGTC"))
#'    res = motifEnrichment(sequences, PWMLogn.dm3.MotifDb.Dmel)
#'
#'    # most enriched in both sequences (lognormal background P-value)
#'    head(motifRankingForGroup(res))
#'
#'    # most enriched in both sequences (raw affinity, no background)
#'    head(motifRankingForGroup(res, bg=FALSE))
#'
#'    # most enriched in the first sequence (lognormal background P-value)
#'    head(motifRankingForSequence(res, 1))
#'
#'    # most enriched in the first sequence (raw affinity, no background)
#'    head(motifRankingForSequence(res, 1, bg=FALSE))
#'
#'    ###
#'    # Load the pre-compiled background for hit-based motif counts with cutoff of P-value = 0.001 
#'    data(PWMPvalueCutoff1e3.dm3.MotifDb.Dmel, package = "PWMEnrich.Dmelanogaster.background")
#'
#'    res.count = motifEnrichment(sequences, PWMPvalueCutoff1e3.dm3.MotifDb.Dmel)
#'
#'    # Enrichment in the whole group, z-score for the number of motif hits
#'    head(motifRankingForGroup(res))
#'
#'    # First sequence, sorted by number of motif hits with P-value < 0.001
#'    head(motifRankingForSequence(res, 1, bg=FALSE))
#'    
#' }
motifEnrichment = function(sequences, pwms, score="autodetect", bg="autodetect", cutoff=NULL, verbose=TRUE, motif.shuffles=30, B=1000,
	group.only=FALSE){
	
	# detect scoring method
	if(score == "autodetect"){
		if(inherits(pwms, "PWMLognBackground")) {
			score = "affinity"
		} else if(inherits(pwms, "PWMCutoffBackground")) {
			score = "cutoff"
		} else {
			score = "affinity"
		}
	}
	
	if(score == "cutoff" & is.null(cutoff))
		cutoff = pwms@bg.cutoff
	
	# the PWM* object
	pwmobj = NULL
	
	# detect background correction
	if(!(bg %in% c("none", "ms"))){
		if(inherits(pwms, "PWMLognBackground")) {
			bg = "logn"
			pwmobj = pwms
			pwms = pwmobj@pwms
		} else if(inherits(pwms, "PWMCutoffBackground")) {
			bg = "z"
			pwmobj = pwms
			pwms = pwmobj@pwms
		} else if(inherits(pwms, "PWMEmpiricalBackground")) {
			bg = "pval"
			pwmobj = pwms
			pwms = pwmobj@pwms
		} else if(inherits(pwms, "PWMGEVBackground")) {
			bg = "gev"
			pwmobj = pwms
			pwms = pwmobj@pwms
		} else {
			bg = "none"
		}
	}

	if(!is.list(pwms))
		pwms = list(pwms)
	
	sequences = .inputParamSequences(sequences)
	
	# return list
	res = list(score=score, bg=bg, pwms=pwms, sequences=sequences, params=list())
	
	# apply no-bg scoring
	if(score == "affinity"){
		# lengths of sequences and PWMs
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		
		res$sequence.nobg = motifScores(sequences, pwms, verbose=verbose)
		res$group.nobg = affinitySequenceSet(res$sequence.nobg, seq.len, pwm.len)
	} else if(score == "clover"){
		res$sequence.nobg = motifScores(sequences, pwms, verbose=verbose)
		res$group.nobg = cloverScore(res$sequence.nobg, verbose=verbose)
	} else if(score == "cutoff"){
		res$params = list(cutoff=cutoff)
		res$sequence.nobg = motifScores(sequences, pwms, cutoff=cutoff, verbose=verbose)
		res$group.nobg = colSums(res$sequence.nobg)
	} else {
		stop(paste("Unknown scoring algorithm: '", score, "'. Please select one of: 'affinity', 'cutoff', 'clover'", sep=""))
	}
	
	if(verbose){
		cat("Calculating motif enrichment scores ...\n")
	}
	
	# apply background correction if needed
	if(bg == "none"){
		res$sequence.bg = NULL
		res$group.bg = NULL
	} else if(bg == "logn"){
		# lengths of sequences and PWMs
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		# do it per sequence
		res$sequence.bg = logNormPval(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.mean, pwmobj@bg.sd, pwmobj@bg.len)
		colnames(res$sequence.bg) = names(pwms)
		# convert logn P-values into normalized observations and run Clover on that
		res$sequence.norm = apply(res$sequence.bg, 1:2, qlnorm, lower.tail=FALSE)
		
		if(score == "affinity"){
			# and for the group
			if(is.matrix(pwmobj@bg.mean)){
				# convert back to P-values, using the chi-sq distribution
				res$group.bg = apply(res$sequence.bg, 2, function(x) {
					pchisq(-2*sum(log(x)), 2*length(x), lower.tail=FALSE)
				})
				#res$group.bg = colMeans(log(res$sequence.bg))
				res$group.norm = sapply(res$group.bg, qlnorm, lower.tail=FALSE)
			} else {
				res$group.bg = logNormPvalSequenceSet(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.mean, pwmobj@bg.sd, pwmobj@bg.len)
				res$group.norm = sapply(res$group.bg, qlnorm, lower.tail=FALSE)
			}
			
		} else if(score == "clover"){
			# now run Clover on normalized scores
			res$group.bg = cloverScore(res$sequence.norm, verbose=verbose)
		}
	} else if(bg == "z"){
		# lengths of sequences and PWMs
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		res$sequence.bg = cutoffZscore(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.P)
		res$group.bg = cutoffZscoreSequenceSet(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.P)
	} else if(bg == "pval"){
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		
		# if to use cutoff
		usecutoff = NULL
		if(score == "cutoff")
			usecutoff = cutoff
		
		if(!group.only){
			res$sequence.bg = t(sapply(1:length(seq.len), function(i)
				 empiricalPvalue(res$sequence.nobg[i,], seq.len[i], pwm.len, pwmobj@bg.fwd, pwmobj@bg.rev, cutoff=usecutoff, B=B, verbose=verbose)))
		}
		
		# do empirical P-value
		if(score == "clover")
			res$group.bg = cloverPvalue1seq(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.fwd, pwmobj@bg.rev, B=B, verbose=verbose, clover=res$group.nobg)
		else	 
			res$group.bg = empiricalPvalueSequenceSet(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.fwd, pwmobj@bg.rev, cutoff=usecutoff, B=B, verbose=verbose)
	} else if(bg == "ms"){
		# if to use cutoff
		usecutoff = NULL
		if(score == "cutoff")
			usecutoff = cutoff
		res$sequence.bg = matrixShuffleZscorePerSequence(res$sequence.nobg, sequences, pwms, cutoff=usecutoff, B=motif.shuffles)
	} else if(bg == "gev"){
		seq.len = sapply(sequences, length)
		pwm.len = sapply(pwms, length)
		
		res$sequence.bg = gevPerSequence(res$sequence.nobg, seq.len, pwm.len, pwmobj@bg.loc, pwmobj@bg.scale, pwmobj@bg.shape)
		res$group.bg = NULL
	} else {
		stop(paste("Uknown background correction algorithm: '", bg, "', Please select one of: 'none', 'logn', 'z', 'pval', 'ms'", sep=""))
	}
	
	# only store the values for the group, not individual sequences!
	if(group.only){
		seq.n = grep("^sequence[.]", names(res))
		if(length(seq.n)>0){
			res = res[-seq.n]
		}
	}
	
	# restore the sequence names
	if("sequence.nobg" %in% names(res)){
		rownames(res$sequence.nobg) = names(sequences)
	}
	if("sequence.bg" %in% names(res)){
		rownames(res$sequence.bg) = names(sequences)
	}
	
	return(new("MotifEnrichmentResults", res=res))
		
}

#' Obtain z-score for motif column shuffling
#'
#' All PWMs are shuffled at the same time. This function would be too slow to produce
#' empirical P-values, thus we return a z-score from a small number of shuffles. 
#'
#' The z-scores are calculated for each sequence individually. 
#' 
#' @param scores a set of already calculated scores
#' @param sequences either one sequence or a list/set of sequences (objects of type DNAString or DNAStringSet)
#' @param pwms a list of PWMs
#' @param cutoff if NULL, will use affinity, otherwise will use number of hits over this log2 odds cutoff
#' @param B number of replicates, i.e. PWM column shuffles
matrixShuffleZscorePerSequence = function(scores, sequences, pwms, cutoff=NULL, B=30){
	# scores from different replicates for each of the sequences
	scores.matrix = array(0, dim=c(nrow(scores), ncol(scores), B))
	dimnames(scores.matrix) = list(rownames(scores), colnames(scores), NULL)
	
	# perfom shuffles	
	for(i in 1:B){
		cat("Perfoming shuffle", i, "/", B, "\n")
		pwms.shuffled = pwms
		for(j in 1:length(pwms)){
			p = pwms[[j]]
			pwms.shuffled[[j]]@pwm = p@pwm[, sample(ncol(p@pwm))]
		}
		
		scores.suffled = motifScores(sequences, pwms.shuffled, cutoff=cutoff, verbose=FALSE)
		scores.matrix[,,i] = scores.suffled
		
	}
	
	# work out mean and sd
	shuffled.mean = apply(scores.matrix, c(1,2), mean)
	shuffled.sd = apply(scores.matrix, c(1,2), sd)
	
	return( (scores - shuffled.mean) / shuffled.sd )
}


#' Calculate the P-value from lognormal distribution with background of equal length
#' 
#' @param scores affinity scores for the PWMs, can contain scores for more than one sequence (as rows), P-values are extracted separately
#' @param seq.len the length distribution of the sequences
#' @param pwm.len the leggths of PWMs
#' @param bg.mean the mean values from the background for PWMs
#' @param bg.sd the sd values from the background
#' @param bg.len the length distribution of the background (we currently support only constant length)
#' @param log if to produce log p-values
logNormPval = function(scores, seq.len, pwm.len, bg.mean, bg.sd, bg.len, log=FALSE){
	if(is.vector(scores)){
		scores = matrix(scores, nrow=1, dimnames=list(NULL, names(scores)))
	}
	if( length(seq.len) != nrow(scores) ){
		stop("The sequence length distribution and number of rows for scores doesn't match")
	}
	if( !all(colnames(scores) == names(bg.mean)) ){
		stop("The motifs column names in 'scores' and 'bg.mean' do not match")
	}
	
	seq.mean = seq.sd = matrix(0, nrow=nrow(scores), ncol=ncol(scores))

	slen = matrix(seq.len, nrow=nrow(scores), ncol=ncol(scores)) - 
		   matrix(pwm.len, nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE) + 1

	# rows (i) - sequences
	# cols (j) - motifs
	if(is.matrix(bg.mean)){
		# human implementation ... 
		
		bg.len.min = matrix(bg.len[1,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
		bg.len.max = matrix(bg.len[nrow(bg.len),], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
	
		bg.len.all = lapply(1:nrow(bg.len), function(i) matrix(bg.len[i,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))
	
		# split into three classes
		min.inx = slen <= bg.len.min
		max.inx = slen >= bg.len.max
		int.inx = !min.inx & !max.inx

		# bg.sd correction factor value
		bgsd = bg.sd * sqrt(bg.len/matrix(bg.len[1,], nrow=nrow(bg.len), ncol=ncol(bg.len), byrow=TRUE))
		bgsd = bgsd / matrix(bgsd[1,], nrow=nrow(bgsd), ncol=ncol(bgsd), byrow=TRUE)
	
		bgsd.all = lapply(1:nrow(bgsd), function(i) matrix(bgsd[i,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))

		# smaller
		if(any(min.inx)){
			bg.mean.min = matrix(bg.mean[1,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
			bg.sd.min = matrix(bg.sd[1,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)

			# keep mean, use quadratic for sd
			seq.mean[min.inx] = bg.mean.min[min.inx]
			seq.sd[min.inx] = bg.sd.min[min.inx] / sqrt(slen[min.inx]/bg.len.min[min.inx])
		} 
		if(any(max.inx)){
			bg.mean.max = matrix(bg.mean[nrow(bg.mean),], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
			bg.sd.max = matrix(bg.sd[nrow(bg.sd),], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)

			# keep mean, use quadratic for sd
			seq.mean[max.inx] = bg.mean.max[max.inx]
			seq.sd[max.inx] = bg.sd.max[max.inx] / sqrt(slen[max.inx]/bg.len.max[max.inx])
		}
		if(any(int.inx)){
			bg.mean.all = lapply(1:nrow(bg.mean), function(i) matrix(bg.mean[i,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))
			bg.sd.all = lapply(1:nrow(bg.sd), function(i) matrix(bg.sd[i,], nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))
			# interpolate!		
			for(k in 1:(nrow(bg.mean)-1)){
				a = bg.len.all[[k]]
				b = bg.len.all[[k+1]]
				inx = int.inx & slen >= a & slen < b
				if(any(inx)){
					# direct linear interpolation for mean
					seq.mean[inx] = bg.mean.all[[k]][inx] + (bg.mean.all[[k+1]][inx] - bg.mean.all[[k]][inx])/(b[inx]-a[inx]) * (slen[inx]-a[inx])
				
					# base value of seq.sd
					seq.sd[inx] = bg.sd.all[[1]][inx] / sqrt(slen[inx]/bg.len.all[[1]][inx])
				
					# this correction factor would ideally be 1
					seq.sd[inx] = seq.sd[inx] * (bgsd.all[[k]][inx] + (bgsd.all[[k+1]][inx]-bgsd.all[[k]][inx])/(b[inx]-a[inx]) * (slen[inx]-a[inx]))
				
				}
			}
		}
	} else {
		# default implementation
		seq.mean = matrix(bg.mean, nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE)
		seq.sd = matrix(bg.sd, nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE) / 
					sqrt(slen / matrix(bg.len, nrow=nrow(scores), ncol=ncol(scores), byrow=TRUE))
	}
	
	# calculate p-values
	seq.p = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	colnames(seq.p) = colnames(scores)
	rownames(seq.p) = rownames(scores)
	
	# calculate the mean/sd parameters of the lognormal distribution
	seq.ml = 2*log(seq.mean) - 0.5*log(seq.mean^2+seq.sd^2)
	seq.sl = sqrt(-2*log(seq.mean) + log(seq.mean^2+seq.sd^2))
	
	# convert to z-scores so we can streamline... 
	z = (log(scores) - seq.ml) / seq.sl
	seq.p[,] = plnorm(exp(z), meanlog=0, sdlog=1, lower.tail=FALSE, log.p=log)
	
	# convert to log10 P-values
	if(log)
		seq.p = seq.p * log10(exp(1))
		
	seq.p
}

#' Lognormal P-value for a set of sequences
#'
#' @param scores a matrix of per-sequence affinity scores
#' @param seq.len lengths of sequences
#' @param pwm.len lengths of pwms
#' @param bg.mean mean background at length of bg.len
#' @param bg.sd standard deviation of background at length of bg.len
#' @param bg.len the length for which mean and sd are calculated
#' @return P-value
logNormPvalSequenceSet = function(scores, seq.len, pwm.len, bg.mean, bg.sd, bg.len){
	# total score for all the sequences
	s = structure(rep(0, ncol(scores)), names=colnames(scores))
	res = structure(rep(0, ncol(scores)), names=colnames(scores))
	for(i in 1:ncol(scores)){
		# lengths of sequences for this pwm, all of them are shorter by pwm.len-1
		seq.len.pwm = seq.len - pwm.len[i] + 1
		
		# sum over sequences then average by total length
		s[i] = sum(scores[,i] * seq.len.pwm) / sum(seq.len.pwm)
		if(is.matrix(bg.mean)){
			res[i] = logNormPval(s[i], sum(seq.len.pwm)+pwm.len[i]-1, pwm.len[i], bg.mean[,i,drop=FALSE], bg.sd[,i,drop=FALSE], bg.len[,i,drop=FALSE])
		} else {
			res[i] = logNormPval(s[i], sum(seq.len.pwm)+pwm.len[i]-1, pwm.len[i], bg.mean[i], bg.sd[i], bg.len[i])
		}
	}
	
 	return(res)
}


#' Replace all infinite values by 0
#'
#' @param x a vector of values
keepFinite = function(x) {
	if(any(!is.finite(x)))
		x[!is.finite(x)] = 0
	x
}

#' Z-score calculation for cutoff hits
#'
#' The Z-score is calculated separately for each sequence
#'
#' @param scores the hit counts for the sequences
#' @param seq.len the length distribution of sequences
#' @param pwm.len the length distribution of the PWMs
#' @param bg.P background probabilities of observing a motif hit at nucleotide resolution 
#'             (scaled to sequence length, not 2 * length)
#' @return Z-score
cutoffZscore = function(scores, seq.len, pwm.len, bg.P){
	if(is.vector(scores))
		scores = matrix(scores, nrow=1)

	# actual probability is 1/2 because we have two strands
	bg.P = bg.P / 2
	
	z = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	rownames(z) = rownames(scores)
	colnames(z) = colnames(scores)
	for(i in 1:nrow(scores)){
		# actualy length is 2x because we have two strands
		P.n = 2 * (seq.len[i] - pwm.len + 1)
		P.mean = bg.P * P.n
		P.sd = sqrt(P.n * bg.P * (1-bg.P))
		# so this would be the poisson approximation
		# z[i,] = 1 / ppois(scores[i,], P.mean, lower.tail=FALSE)
		z[i,] = keepFinite( (scores[i,] - P.mean) / P.sd )
	}
	
	return(z)
}

#' Z-score calculation for cutoff hits for group of sequences
#'
#' The Z-score is calculated as if the sequence came for one very long sequence
#'
#' @param scores the hit counts for the sequences
#' @param seq.len the length distribution of sequences
#' @param pwm.len the length distribution of the PWMs
#' @param bg.P background probabilities of observing a motif hit at nucleotide resolution
#' @return Z-score
cutoffZscoreSequenceSet = function(scores, seq.len, pwm.len, bg.P){	
	total.hits = colSums(scores)
	total.len = sum(seq.len) - length(seq.len)*(pwm.len-1) + pwm.len - 1
	
	# do it seperate for each PWM to make sure the lengths are right
	res = sapply(1:length(total.hits), function(i) cutoffZscore(total.hits[i], total.len[i], pwm.len[i], bg.P[i]))
	names(res) = names(total.hits)
	
	return(res)
}

#' Calculate the empirical P-value by affinity of cutoff. 
#'
#' This is the new backend function for empirical P-values for either affinity or cutoff. 
#' The function only works on single sequences. 
#'
#' @param scores the scores obtained for the sequence
#' @param seq.len the length of the sequence, if a single value will take a single sequence
#'                of given length. If a vector of values, will take sequences of given lengths
#'                and joint them together
#' @param pwm.len the lengths of PWMs
#' @param bg.fwd raw odds scores for the forward strand of background
#' @param bg.rev raw odds scores for the reverse strand of background
#' @param cutoff if not NULL, will use hit count above this cutoff. The cutoff should be specified in log2. 
#' @param B the number of random replicates
#' @param verbose if to give verbose progress reports
#' @param exact.length if to take into consideration that the actual sequence lengths differ for different PWMs.
#'                     For very long sequences (i.e. seq.len >> pwm.len) this make very little difference, however
#'                     the run time with exact.length is much longer. 
empiricalPvalue = function(scores, seq.len, pwm.len, bg.fwd, bg.rev, cutoff=NULL, B=10000, verbose=FALSE, exact.length=FALSE){
	# length of sequences
	bg.len = nrow(bg.fwd)
	
	if(max(seq.len) > bg.len){
		stop("The length of sequence greater than background!")
	} 
	
	if(max(seq.len) * B > bg.len){
		warning("The length of the sequence multiplied by number of runs (B) is greater than background length. This might lead to unreliable P-value estimates.")
	}
		
	# the empricial p-value
	score.p = rep(0, length(scores))
		
	for(i in 1:B){
		if(verbose)
			message("Random sample ", i, " / ", B)
		
		# calculate the final score as sum over different seq.len sequences
		final.score = 0
		for(k in 1:length(seq.len)){
			# choose a subsequence with matching length
			start.range = 1:(bg.len - seq.len[k])
			start = sample(start.range, 1)
		
			# fetch the scores from the two strands
			score.fwd = bg.fwd[start:(start+seq.len[k]-1),]
			score.rev = bg.rev[start:(start+seq.len[k]-1),]

			# simulate that we don't have scores for last pwm.len+1 nucleotides
			if(exact.length){
				for(j in 1:length(pwm.len)){
					score.fwd[(nrow(score.fwd)-pwm.len[j]+1):nrow(score.fwd),j] = NA
					score.rev[(nrow(score.rev)-pwm.len[j]+1):nrow(score.rev),j] = NA
				}
			}
			
			# append both strands to one long string		
			score.both = rbind(score.fwd, score.rev)
		
			if(is.null(cutoff)){
				# affinity algorithm
				new.score = colSums(score.both, na.rm=TRUE)
			} else {
				# number of hits above threshold
				new.score = colSums(score.both >= 2^cutoff, na.rm=TRUE)
			}
			final.score = final.score + new.score
		}
		
		# to obtain the mean we divide by twice the length (because it's both strands)
		final.score = final.score / (2 * sum(seq.len)) 
		
		score.p = score.p + ((final.score > scores) + 0)
		
	}
	
	return(score.p / B)
}

#' Empirical P-value for a set of sequences
#'
#' Calculate empirical P-value for a set of sequences, using either affinity or cutoff. When cutoff is used, the score
#' is a number of motif hits above a certain log-odds cutoff. 
#'
#' @param scores a matrix of scores, rows for sequences, columns for PWMs
#' @param seq.len the lengths of sequences
#' @param pwm.len the lengths of PWMs
#' @param bg.fwd raw odds scores for the forward strand of background
#' @param bg.rev raw odds scores for the reverse strand of background
#' @param cutoff if not NULL, will use hit count above this cutoff. The cutoff should be specified in log2. 
#' @param B the number of random replicates
#' @param verbose if to give verbose progress reports
empiricalPvalueSequenceSet = function(scores, seq.len, pwm.len, bg.fwd, bg.rev, cutoff=NULL, B=10000, verbose=FALSE){
	# affinity
	if(is.null(cutoff)){
		# total score for all the sequences
		s = structure(rep(0, ncol(scores)), names=colnames(scores))
		res = structure(rep(0, ncol(scores)), names=colnames(scores))
		exact.lengths = structure(rep(0, ncol(scores)), names=colnames(scores))
		for(i in 1:ncol(scores)){
			# lengths of sequences for this pwm, all of them are shorter by pwm.len-1
			seq.len.pwm = seq.len - pwm.len[i] + 1
			exact.lengths[i] = sum( seq.len.pwm )
		
			# sum over sequences then average by total length
			s[i] = sum(scores[,i] * seq.len.pwm) / sum(seq.len.pwm)			
		}
		
		# approximate sequence length without all the last positions being matched
		total.len = seq.len - round(mean(pwm.len))
		
		# run it on mean sequence length
		res = empiricalPvalue(s, total.len, pwm.len, bg.fwd, bg.rev, cutoff, B, verbose, exact.length=FALSE)
	
	 	return(res)
	} else {
		total.hits = colSums(scores)
		total.len = seq.len - round(mean(pwm.len)) #sum(seq.len) - length(seq.len)*pwm.len
	
		# run ti on mean sequence lengths (with pwm.len)
		res = empiricalPvalue(total.hits, total.len, pwm.len, 
					bg.fwd, bg.rev, cutoff, B, verbose, exact.length=FALSE)
					
		return(res)
	}
}

#' Apply GEV background normalization per every sequence
#' 
#' @param scores affinity scores for the PWMs, can contain scores for more than one sequence (as rows), P-values are extracted separately
#' @param seq.len the length distribution of the sequences
#' @param pwm.len the lengths of PWMs
#' @param bg.loc list of linear regression for location parameter
#' @param bg.scale list of linear regression for scale parameter
#' @param bg.shape list of linear regression for shape parameter
gevPerSequence = function(scores, seq.len, pwm.len, bg.loc, bg.scale, bg.shape){
	if(is.vector(scores)){
		scores = matrix(scores, nrow=1, dimnames=list(NULL, names(scores)))
	}
	if( length(seq.len) != nrow(scores) ){
		stop("The sequence length distribution and number of rows for scores doesn't match")
	}
	
	# calculate the expected sd values for the length distribution for each sequence
	seq.sd = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	
	log.seqlen = data.frame("log.len"=log(seq.len))
	
	# predict loc, scale, shape for each PWM
	loc = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	scale = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	shape = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	for(i in 1:ncol(scores)){
		loc[,i] = as.vector(predict.lm(bg.loc[[i]], newdata=log.seqlen))
		scale[,i] = as.vector(predict.lm(bg.scale[[i]], newdata=log.seqlen))
		shape[,i] = as.vector(predict.lm(bg.shape[[i]], newdata=log.seqlen))
	}
	
	## work out the P-values
	seq.p = matrix(0, nrow=nrow(scores), ncol=ncol(scores))
	colnames(seq.p) = colnames(scores)
	rownames(seq.p) = rownames(scores)
	
	for(i in 1:nrow(seq.p)){
		for(j in 1:ncol(seq.p)){
			# if scale is <0 the GEV approximation is no longer valid
			if(scale[i,j] < 0)
				seq.p[i,j] = NA
			else
				seq.p[i,j] = pgev(log(scores[i,j]), loc=loc[i,j], scale=scale[i,j], shape=shape[i,j], lower.tail=FALSE)
		}
	}
	
	seq.p
}

#' Calculate total affinity over a set of sequences
#'
#' @param scores affinity scores for individual sequences
#' @param seq.len lengths of sequences
#' @param pwm.len lengths of PWMs
affinitySequenceSet = function(scores, seq.len, pwm.len){
	if(is.vector(scores))
		scores = matrix(scores, nrow=1, dimnames=list(NULL, names(scores)))

	# final scores for the group
	final = structure(rep(0, ncol(scores)), names=colnames(scores))
	
	for(i in 1:ncol(scores)){
		# actual length of scanned sequence
		seq.len.pwm = seq.len - pwm.len[i] + 1
		
		final[i] = sum(scores[,i] * seq.len.pwm) / sum(seq.len.pwm)
	}
	
	return(final)
}

#' Calculate Recovery-AUC for motifs ranked according to some scoring scheme
#'
#' Note that this function asssumes that smaller values are better!
#' 
#' @param seq.res a matrix where each column represents a PWM and each row a result for a different sequence. 
motifRecoveryAUC = function(seq.res){
	# calculate the ranks
	r = t(apply(seq.res, 1, rank))
	
	# previous point needed to calculate AUC-ROC
	auc = rep(0, ncol(r))
	names(auc) = colnames(r)
	
	# do it for all whole steps
	rec = t(sapply(1:ncol(r), function(i) colSums(r<=i)))
	
	list("rec"=rec, "auc"=colSums(rec))
}

#' Calculate PR-AUC for motifs ranked according to some scoring scheme
#'
#' Note that this function asssumes that smaller values are better!
#' 
#' @param seq.res a matrix where each column represents a PWM and each row a result for a different sequence. 
motifPrAUC = function(seq.res){
	# calculate the ranks
	r = t(apply(seq.res, 1, rank))
	
	# previous point needed to calculate AUC-ROC
	auc = rep(0, ncol(r))
	names(auc) = colnames(r)
	
	# do it for all whole steps
	rec = t(sapply(1:ncol(r), function(i) colSums(r<=i)))
	
	recall = rec / nrow(r)
	prec = recall / 1:nrow(rec)
	
	list("prec"=prec, "recall"=recall)
}

Try the PWMEnrich package in your browser

Any scripts or data that you put into this service are public.

PWMEnrich documentation built on Nov. 8, 2020, 7:45 p.m.