R/523-extractProtPSSMAcc.R

Defines functions extrProtPSSMAcc

Documented in extrProtPSSMAcc

#' Profile-based protein representation derived by PSSM 
#' (Position-Specific Scoring Matrix) and auto cross covariance
#'
#' Profile-based protein representation derived by PSSM 
#' (Position-Specific Scoring Matrix) and auto cross covariance
#'
#' This function calculates the feature vector based on the PSSM 
#' by running PSI-Blast and auto cross covariance tranformation.
#'
#' @param pssmmat The PSSM computed by \code{\link{extrProtPSSM}}.
#' @param lag The lag parameter. Must be less than the number of amino acids 
#' in the sequence (i.e. the number of columns in the PSSM matrix).
#'
#' @return A length \code{lag * 20^2} named numeric vector,
#' the element names are derived by the amino acid name abbreviation
#' (crossed amino acid name abbreviation) and lag index.
#'
#' @seealso \link{extrProtPSSM} \link{extrProtPSSMFeature}
#'
#' @keywords extract PSSM Blast Alignment
#'
#' @aliases extrProtPSSMAcc
#'
#' @author Min-feng Zhu <\email{wind2zhu@@163.com}>, 
#'         Nan Xiao <\url{http://r2s.name}>
#'
#' @export extrProtPSSMAcc
#'
#' @references
#' Wold, S., Jonsson, J., Sj\"{o}rstr\"{o}m, M., Sandberg, 
#' M., & R\"{a}nnar, S. (1993). 
#' DNA and peptide sequences and chemical processes multivariately modelled 
#' by principal component analysis and partial least-squares projections 
#' to latent structures. 
#' \emph{Analytica chimica acta}, 277(2), 239--253.
#' 
#' @examples
#' \donttest{
#' x = readFASTA(system.file('protseq/P00750.fasta', package = 'BioMedR'))[[1]]
#' dbpath = tempfile('tempdb', fileext = '.fasta')
#' invisible(file.copy(from = system.file('protseq/Plasminogen.fasta', package = 'BioMedR'), to = dbpath))
#' pssmmat = extrProtPSSM(seq = x, database.path = dbpath)
#' pssmacc = extrProtPSSMAcc(pssmmat, lag = 3)
#' tail(pssmacc)
#' }
#'

extrProtPSSMAcc = function(pssmmat, lag) {

    # Normalize PSSM scores to (0, 1)
    mat = 1/(1 + exp(pssmmat))

    accpssm = function (mat, lag) {

        p = nrow(mat)
        n = ncol(mat)

        if (lag > n) stop('lag must be smaller than the amino acids in the sequence')

        # auto covariance: p elements

        acc1 = matrix(0, nrow = p, ncol = lag)

        for (j in 1:p) {
            for (i in 1:lag) {
                acc1[j, i] = sum(mat[j, 1:(n - i)] * mat[j, ((1:(n - i)) + i)]) / (n - i)
            }
        }

        acc1 = as.vector(acc1)

        AADict = c('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 
                   'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V')
        names(acc1) = as.vector(outer(AADict, paste0('lag', 1:lag), 
                                      paste, sep = '.'))

        # auto cross covariance: p^2 - p elements

        acc2 = matrix(0, nrow = p^2 - p, ncol = lag)

        idx = cbind(combn(1:p, 2), combn(1:p, 2)[2:1, ])

        for (j in 1:ncol(idx)) {
            for (i in 1:lag) {
                acc2[j, i] = sum(mat[idx[1, j], 1:(n - i)] * mat[idx[2, j], ((1:(n - i)) + i)]) / (n - i)
            }
        }

        acc2 = as.vector(acc2)
        names(acc2) = as.vector(outer(paste(AADict[idx[1, ]], AADict[idx[2, ]], 
                                            sep = '.'), 
                                      paste0('lag', 1:lag), paste, sep = '.'))

        ACC = c(acc1, acc2)

        return(ACC)

    }

    result = accpssm(mat, lag)

    return(result)

}

Try the BioMedR package in your browser

Any scripts or data that you put into this service are public.

BioMedR documentation built on Nov. 17, 2017, 10:08 a.m.