R/scoringMatrix.R

Defines functions normargPriorParams EScore normarg_scoringMat

### =================================================================
### pairwise genome alignment stuff
### -----------------------------------------------------------------

### -----------------------------------------------------------------
### Scoring matrix used in lastz from http://genomewiki.ucsc.edu/index.php/Hg19_100way_conservation_lastz_parameters.
### NOT Exported yet!

## This is the scoring matrix for highly divergent species.
HOXD55_MATRIX <- matrix(c(91, -90, -25, -100,
                          -90, 100, -100, -25,
                          -25, -100, 100, -90,
                          -100, -25, -90, 91),
                        nrow=4, ncol=4,
                        dimnames=list(c("A", "C", "G", "T"),
                                      c("A", "C", "G", "T"))
                        )

## This is the default medium scoring matrix.
HOXD70_MATRIX <- matrix(c(91, -114, -31, -123,
                          -114, 100, -125, -31,
                          -31, -125, 100,-114,
                          -123, -31, -114, 91),
                        nrow=4, ncol=4,
                        dimnames=list(c("A", "C", "G", "T"),
                                      c("A", "C", "G", "T"))
                        )

## This is the scoring matrix for close species.
HVSC_MATRIX <- matrix(c(90, -330, -236, -356,
                        -330, 100, -318, -236,
                        -236, -318, 100, -330,
                        -356, -236, -330, 90),
                      nrow=4, ncol=4,
                      dimnames=list(c("A", "C", "G", "T"),
                                    c("A", "C", "G", "T"))
                      )

### -----------------------------------------------------------------
### scoring matrix validator
### Not Exported!
normarg_scoringMat <- function(scoringMat){
  if(!is.matrix(scoringMat) || !is.numeric(scoringMat))
    stop("'", deparse(substitute(scoringMat)), "' must be a numeric matrix")
  if(!identical(rownames(scoringMat), DNA_BASES) || 
     !identical(colnames(scoringMat), DNA_BASES))
    stop("' rownames(", deparse(substitute(scoringMat)), ")' and 'colnames(", 
         deparse(substitute(scoringMat)),
         ")' must be the 4 DNA bases ('DNA_BASES')")
  if(any(is.na(scoringMat)))
    stop("'", deparse(substitute(scoringMat)), "' contains NAs")
  if(!isSymmetric(scoringMat))
    stop("'", deparse(substitute(scoringMat)), "' must be symmetric")
  return(scoringMat)
}

### -----------------------------------------------------------------
### calculate the threshold score based on window size, identity and 
### scoring matrix
### NOT Exported yet!
## This function tries to be compatible with the minScore_winSize thresholds
## used in previous CNEs identification.
## This function will output the equivilent threshold with a scoring matrix.
EScore <- function(scoringMat, winSize, minScore, 
                   ## NOT FINISHED!
                   bg=c(A=0.25, C=0.25, G=0.25, T=0.25),
                   gapOpen=NULL, gapExt=NULL){
  bg <- normargPriorParams(bg)
  scoringMat <- normarg_scoringMat(scoringMat)
  ans <- 0
  ## first calculate the scores from matches
  for(base in names(bg)){
    ans <- ans + minScore * bg[base] * scoringMat[base, base]
  }
  ## second calculate the scores from 

}

### Typical 'prior.params' vector: c(A=0.25, C=0.25, G=0.25, T=0.25)
### This is taken from Biostrings package.
normargPriorParams <- function(prior.params)
{
    if (!is.numeric(prior.params))
        stop("'prior.params' must be a numeric vector")
    if (length(prior.params) != length(DNA_BASES) ||
        !setequal(names(prior.params), DNA_BASES))
        stop("'prior.params' elements must be named A, C, G and T")
    ## Re-order the elements.
    prior.params <- prior.params[DNA_BASES]
    if (any(is.na(prior.params)) || any(prior.params < 0))
        stop("'prior.params' contains NAs and/or negative values")
    prior.params
}
ge11232002/CNEr documentation built on Oct. 26, 2022, 7:08 p.m.