R/motif_analysis.R
In atSNP: Affinity test for identifying regulatory SNPs

Documented in ComputeMotifScore ComputePValues GetIUPACSequence LoadFastaData LoadMotifLibrary LoadSNPData MatchSubsequence

#' @name LoadMotifLibrary
#' @title Load position weight matrices.
#' @description Load the file for position weight matrices for motifs.
#' @param filename a MEME format file name.
#' @param urlname URL containing a MEME format file.
#' @param tag A string that marks the description line of the position weight 
#' matrix.
#' @param skiprows Number of description lines before each position weight 
#' matrix.
#' @param skipcols Number of columns to be skipped in the position weight 
#' matrix.
#' @param transpose If TRUE (default), then the position weight matrix should 
#' have 4 columns. Otherwise, it should have 4 rows.
#' @param field The index of the field in the description line, seperated by 
#' space, that indicates the motif name.
#' @param sep A vector of chars for the string separators to parse each lines of
#'  the matrix. Default: c(" ", "\\t").
#' @param pseudocount An integer for the pseudocount added to each of the 
#' original matrices. Default: 0. Recommended to be 1 if the original matrices 
#' are position frequency matrices.
#' @details This function reads the formatted file containing motif information 
#' and convert them into a list of position weight matrices. The list of 
#' arguments should provide enough flexibility of importing a varying number of 
#' formats. Some examples are the following:
#' For MEME format, the suggested arguments are: tag = 'Motif', skiprows = 2, 
#' skipcols = 0, transpose = FALSE, field = 2, sep = ' ';
#' For motif files from JOHNSON lab (i.e. 
#' http://johnsonlab.ucsf.edu/mochi_files/JASPAR_motifs_H_sapiens.txt), 
#' the suggested arguments are: tag = '/NAME', skiprows = 1, skipcols = 0, 
#' transpose = FALSE, field = 2, sep = "\\t";
#' For JASPAR pfm matrices (i.e. http://jaspar.genereg.net/download/CORE/JASPAR
#' 2018_CORE_vertebrates_non-redundant_pfms_jaspar.txt), the suggested arguments
#' are: tag = ">", skiprows = 1, skipcols = 0, transpose = TRUE, field = 1, 
#' sep = "\\t"; For the TRANSFAC library provided by UCF bioinformatics groups 
#' (i.e. http://gibbs.biomed.ucf.edu/PreDREM/download/nonredundantmotif.transfac
#' ), the suggested arguments are: tag = "DE", skiprows = 1, skipcols = 1, 
#' transpose = FALSE, field = 2, sep = "\\t".
#' @return A list object of position weight matrices.
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo 
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' pwms <- LoadMotifLibrary(
#' urlname="http://pages.stat.wisc.edu/~keles/atSNP-Data/pfm_vertebrates.txt", 
#' tag = ">", transpose = FALSE, field = 1, sep = c("\t", " ", ">"), 
#' skipcols = 1, skiprows = 1, pseudocount = 1)
#' @useDynLib atSNP
#' @import BiocFileCache
#' @import rappdirs
#' @export
LoadMotifLibrary <- function(filename = NULL, urlname = NULL, tag = "MOTIF", transpose = FALSE, field = 2, sep = c("\t", " "), skipcols = 0, skiprows = 2, pseudocount = 0) {
  if ( is.null(filename) & is.null(urlname) )  {
    stop("one argument among 'filename' and 'urlname' should be provided.")
  } else {
    if(!is.null(filename) & !is.null(urlname))
      stop("only one argument among 'filename' and 'urlname' should be provided.")
  }
  if(is.null(filename)) {
    bfc <- BiocFileCache(cache = user_cache_dir(appname = "BiocFileCache"), ask = FALSE)
    rid <- bfcrid(bfcquery(bfc, query=basename(urlname), exact=TRUE, field="rname"))
    if (!length(rid))
      rid <- names(bfcadd(bfc, rname=basename(urlname), urlname))
    lines<-readLines(bfcrpath(rids=rid))
  } else {
    lines<-readLines(filename)
  }

    motifLineNums <- grep(tag, lines)
  if(length(myStrSplit(lines[motifLineNums[1]], sep)[[1]]) >= field) {
    motifnames <-
      sapply(myStrSplit(lines[motifLineNums], sep), function(x) x[field])
  } else {
    motifnames <-
      sapply(myStrSplit(lines[motifLineNums], sep), function(x) x[field])
  }
  allmats <- as.list(seq_along(motifnames))

  for(matrixId in seq_along(motifLineNums)) {
    motifLineNum <- motifLineNums[matrixId] + skiprows
    if(!transpose) {
      pwm <- NULL
      nrows <- 0
      tmp <- myStrSplit(lines[nrows + motifLineNum], split = sep)[[1]]
      tmp <- tmp[nchar(tmp) > 0]
      while(length(tmp) >= 4 + skipcols) {
        tmp <- as.numeric(tmp[skipcols + seq(4)])
        if(sum(is.na(tmp)) == 0) {
          pwm <- rbind(pwm, tmp)
        }
        nrows <- nrows + 1
        tmp <- myStrSplit(lines[nrows + motifLineNum], split = sep)[[1]]
        tmp <- tmp[nchar(tmp) > 0]
      }
    } else {
      nrows <- 4
      if(skipcols == 0) {
        pwm <-
          matrix(as.numeric(unlist(myStrSplit(lines[seq(nrows) + motifLineNum - 1], split = sep))), ncol = 4)
      } else {
        pwm <-
          matrix(as.numeric(unlist(sapply(myStrSplit(lines[seq(nrows) + motifLineNum - 1], split = sep), function(x) x[-seq(skipcols)]))), ncol = 4)
      }
    }
    pwm <- pwm + pseudocount
    pwm <- pwm / apply(pwm, 1, sum)
    pwm <- t(apply(pwm, 1,
                   function(x) {
                     x[x < 1e-10] <- 1e-10 / (1 - 1e-10 * sum(x < 1e-10)) * sum(x)
                     return(x / sum(x))
                   }))
    rownames(pwm) <- NULL
    allmats[[matrixId]] <- pwm
  }
  names(allmats) <- motifnames
  return(allmats)
}


#' @name LoadSNPData
#' @title Load the SNP information and code the genome sequences around the SNP 
#' locations.
#' @description Load the SNP data.
#' @param filename A table containing the SNP information. Must contain at least
#'  five columns with exactly the following names:
#' \tabular{ll}{
#' chr \tab chromosome.\cr
#' snp \tab The nucleotide position of the SNP.\cr
#' snpid \tab The names of the SNPs.\cr
#' a1 \tab The deoxyribose for one allele.\cr
#' a2 \tab The deoxyribose for the other allele.\cr
#' }
#' If this file exists already, it is used to extract the SNP information. 
#' Otherwise, SNP information extracted using argument 'snpids' is outputted to 
#' this file.
#' @param snpids A vector of rs ids for the SNPs. This argument is overidden 
#' if the file with name \code{filename} exists.
#' @param snp.lib A string of the library name to obtain the SNP information 
#' based on rs ids. Default: "SNPlocs.Hsapiens.dbSNP144.GRCh38".
#' @param genome.lib A string of the library name for the genome version. 
#' Default: "BSgenome.Hsapiens.UCSC.hg38".
#' @param half.window.size An integer for the half window size around the SNP 
#' within which the motifs are matched. Default: 30.
#' @param default.par A boolean for whether using the default Markov parameters.
#'  Default: FALSE.
#' @param mutation A boolean for whether this is mutation data. See details for 
#' more information. Default: FALSE.
#' @param ... Other parameters passed to \code{\link[utils]{read.table}}.
#' @details This function extracts the nucleotide sequence within a window 
#' around each SNP and code them using 1-A, 2-C, 3-G, 4-T.\cr
#' There are two ways of obtaining the nucleotide sequences. If \code{filename} 
#' is not NULL and the file exists, it should contain the positions and alleles 
#' for each SNP. Based on such information, the sequences around SNP positions 
#' are extracted using the Bioconductor annotation package specified by 
#' \code{genome.lib}. Users should make sure that this annotation package 
#' corresponds to the correct species and genome version of the actual data. 
#' Alternatively, users can also provide a vector of rs ids via the argument 
#' \code{snpids}. The SNP locations and allele information is then obtained via 
#' the Bioconductor annotation package specified by \code{snp.lib}, and passed 
#' on to the package specified by \code{genome.lib} to further obtain the 
#' nucleotide sequences.\cr
#' If \code{mutation=FALSE} (default), this function assumes that the data is 
#' for SNP analysis, and the reference genome should be consistent with either 
#' the a1 or a2 nucleotide. When extracting the genome sequence around each SNP 
#' position, this function compares the nucleotide at the SNP location on the 
#' reference genome with both a1 and a2 to distinguish between the reference 
#' allele and the SNP allele. If the nucleotide extracted from the reference 
#' genome does not match either a1 or a2, the SNP is discarded. The discarded 
#' SNPs are in the 'rsid.rm' field in the output.\cr
#' Alternatively, if \code{mutation=TRUE}, this function assumes that the data 
#' is for general single nucleotide mutation analysis. After extracting the 
#' genome sequence around each SNP position, it replaces the nucleotide at the 
#' SNP location by the a1 nucleotide as the 'reference' allele sequence, and by 
#' the a2 nucleotide as the 'snp' allele sequence. It does NOT discard the 
#' sequence even if neither a1 or a2 matches the reference genome. When this 
#' data set is used in other functions, such as \code{\link{ComputeMotifScore}},
#'  \code{\link{ComputePValues}}, all the results (i.e. affinity scores and 
#'  their p-values) for the reference allele are indeed for the a1 allele, and 
#'  results for the SNP allele are indeed for the a2 allele.\cr
#' If the input is a list of rsid's, the SNP information extracted from 
#' \code{snp.lib} may contain more than two alleles for a single location. For 
#' such cases, \code{\link{LoadSNPData}} first extracts all pairs of alleles 
#' associated with those locations. If 'mutation=TRUE', all those pairs are 
#' considered as pairs of reference and SNP alleles, and their information is 
#' contained in 'sequence_matrix', 'a1', 'a2' and 'snpid'. If 'mutation=FALSE', 
#' \code{\link{LoadSNPData}} further filters these pairs based on whether one 
#' allele matches to the reference genome nucleotide extracted from 
#' \code{genome.lib}. Only those pairs with one allele matching the reference 
#' genome nucleotide is considered as pairs of reference and SNP alleles, with 
#' their information contained in 'sequence_matrix', 'a1', 'a2' and 'snpid'.\cr
#' @return A list object containing the following components:
#' \tabular{ll}{
#' sequence_matrix \tab A list of integer vectors representing the deroxyribose 
#' sequence around each SNP.\cr
#' a1 \tab An integer vector for the deroxyribose at the SNP location on the 
#' reference genome.\cr
#' a2 \tab An integer vector for the deroxyribose at the SNP location on the 
#' SNP genome.\cr
#' snpid \tab A string vector for the SNP rsids.\cr
#' rsid.missing \tab If the data source is a list of rsids, this field records 
#' rsids for SNPs that are discarded because they are not in the SNPlocs package.\cr
#' rsid.duplicate \tab If the data source is a list of rsids, this field records
#'  rsids for SNPs that based on the SNPlocs package, this locus has more than 
#'  2 alleles. \cr
#' rsid.na \tab This field records rsids for SNPs that are discarded because the
#'  nucleotide sequences contain none ACGT characters.\cr
#' rsid.rm \tab If the data source is a table and \code{mutation=FALSE}, this 
#' field records rsids for SNPs that are discarded because the nucleotide on the
#'  reference genome matches neither 'a1' or 'a2' in the data source.\cr
#' }
#' The results are coded as: "A"-1, "C"-2, "G"-3, "T"-4.
#' @author Chandler Zuo \email{chandler.c.zuo@@gmail.com}
#' @examples
#' \dontrun{LoadSNPData(snpids = c("rs53576", "rs7412"), 
#' genome.lib ="BSgenome.Hsapiens.UCSC.hg38", snp.lib = 
#' "SNPlocs.Hsapiens.dbSNP144.GRCh38", half.window.size = 30, default.par = TRUE
#' , mutation = FALSE)}
#' @import BSgenome
#' @useDynLib atSNP
#' @export
LoadSNPData <- function(filename = NULL, genome.lib = "BSgenome.Hsapiens.UCSC.hg38",
                        snp.lib = "SNPlocs.Hsapiens.dbSNP144.GRCh38",
                        snpids = NULL, half.window.size = 30, default.par = FALSE,
                        mutation = FALSE, ...) {
  useFile <- FALSE
  rsid.rm <- rsid.missing <- rsid.duplicate <- rsid.na <- NULL
  if(!is.null(filename)) {
    if(file.exists(filename)) {
      useFile <- TRUE
    }
  }
  if(useFile) {
    if(!is.null(snpids)) {
      message("Warning: load SNP information from 'filename' only. The argument 'snpids' is overridden.")
    }
    tbl <- read.table(filename, header = TRUE, stringsAsFactors = FALSE,  colClasses = c("character"), ...)
    tbl<-tbl[c("snpid", "chr", "snp", "a1", "a2")]
    tbl$snp<-as.integer(tbl$snp)
    ## check if the input file has the required information
    if(sum(!c("snp", "chr", "a1", "a2", "snpid") %in% names(tbl)) > 0) {
      stop("'filename' must be a table containing 'snp' and 'chr' columns.")
    }
    snpid.index <- seq(nrow(tbl))
  } else {
    if(is.null(snpids)) {
      stop("either 'snpids' should be a vector, or 'filename' should be the file name that contains the SNP information.")
    }
    snpid.index <- seq_along(snpids)
    ## load the corresponding snp library
    library(package = snp.lib, character.only = TRUE)
    rsid.missing.all <- NULL
    snps <- get(snp.lib)
    snp.loc <- tryCatch({snpsById(snps, snpids, ifnotfound="error")}, error = function(e) return(e$message))
    ## remove rsids not included in the database
    if(is(snp.loc, "character")) {
      rsid.missing <- myStrSplit(snp.loc, split = c(": ", "\n"))[[1]][-1]
      rsid.missing.all <- myStrSplit(rsid.missing, split = c(",", " "))[[1]]
      snpids <- snpids[!snpids %in% rsid.missing]
      snp.loc <- snpsById(snps, snpids, ifnotfound="drop")
    } 
    
    if(!is.null(rsid.missing.all)) {
      message("Warning: the following rsids are not included in the database and discarded: ")
      message(paste(rsid.missing.all, collapse = ", "))
      rsid.missing <- rsid.missing.all
    }
    
    snp.alleles <- IUPAC_CODE_MAP[snp.loc@elementMetadata@listData$alleles_as_ambig]
    snp.strands<-as.character(snp.loc@strand)
    if(sum(nchar(snp.alleles) > 2) > 0) {
      message("Warning: the following SNPs have more than 2 alleles. All pairs of nucleotides are considered as pairs of the SNP and the reference allele:")
      rsid.duplicate <- snpids[nchar(snp.alleles) > 2]
      message(paste(rsid.duplicate, collapse = ", "))
    }
    ## retain only SNPs with >= 2 alleles
    tbl <- NULL
    for(nalleles in 2:4) {
      ids <- which(sapply(snp.alleles, nchar) == nalleles)
      if(length(ids) == 0) {
        next
      }
      snp.loc.n <- snp.loc[ids]
      snp.alleles.n <- snp.alleles[ids]
      snp.ids.n <- snpids[ids]
      snp.alleles.n <- strsplit(snp.alleles.n, "")
      snp.strands.n <- snp.strands[ids]
      ## get all pairs of alleles
      for(i_allele1 in seq(nalleles - 1)) {
        for(i_allele2 in (i_allele1 + 1):nalleles) {
          a1 <- sapply(snp.alleles.n, function(x) x[i_allele1])
          a2 <- sapply(snp.alleles.n, function(x) x[i_allele2])
          
          ## revert the alleles on the reverse strand
          id.rev <- which(snp.strands.n == "-")
          if(length(id.rev) > 0) {
            rev.codes <- c("A", "C", "G", "T")
            names(rev.codes) <- rev(rev.codes)
            a1[id.rev] <- rev.codes[a1[id.rev]]
            a2[id.rev] <- rev.codes[a2[id.rev]]
          }
          tbl <- rbind(tbl,
                       data.frame(snp = as.integer(snp.loc.n@ranges), 
                                  chr = as.character(paste0("chr", gsub("ch", "", snp.loc.n@seqnames))), 			  
                                  a1 = as.character(a1),
                                  a2 = as.character(a2),
                                  snpid = as.character(snp.ids.n),
                                  index = snpid.index[ids])
          )
        }
      }
    }
    
    tbl <- tbl[order(tbl$index), ]
    if(!is.null(filename)) {
      write.table(tbl[, c('snp', 'chr', 'a1', 'a2', 'snpid')], file = filename, row.names = FALSE, col.names = TRUE, quote = FALSE)
    }
    snpid.index <- tbl$index
  }
  
  ## load the corresponding genome version
  library(package = genome.lib, character.only = TRUE)
  species<-get(strsplit(genome.lib, "[.]")[[1]][2])
  seqvec <- getSeq(species,
                   as.character(tbl$chr),
                   start=tbl$snp - half.window.size,
                   end=tbl$snp + half.window.size,
                   as.character=TRUE)
  codes <- seq(4)
  names(codes) <- c("A", "C", "G", "T")
  sequences <- sapply(seqvec, function(x) codes[strsplit(x, "")[[1]]])
  sequences <- matrix(sequences, ncol = length(seqvec))
  snpid.output <- as.character(tbl$snpid)
  rownames(sequences) <- NULL
  a1 <- codes[as.character(tbl$a1)]
  a2 <- codes[as.character(tbl$a2)]
  names(a1) <- names(a2) <- NULL
  keep.id <- which(colSums(is.na(sequences)) == 0)
  if(length(keep.id) < nrow(tbl)) {
    message("Warning: the following rows are discarded because the reference genome sequences contain non ACGT characters:")
    rsid.na <- tbl[-keep.id, ]$snpid
    print(tbl[-keep.id, ])
  }
  ## remove sequences containing non ACGT characters
  sequences <- sequences[, keep.id, drop = FALSE]
  a1 <- a1[keep.id]
  a2 <- a2[keep.id]
  snpid.output <- snpid.output[keep.id]
  snpid.index <- snpid.index[keep.id]
  ## whether use the default parameters
  if(!default.par) {
    transition <- .Call("transition_matrix", sequences, package = "atSNP")
    prior <- rowSums(transition)
    prior <- prior / sum(prior)
    transition <- transition / rowSums(transition)
    names(prior) <- colnames(transition) <- rownames(transition) <- c("A", "C", "G", "T")
  } else {
    data(default_par, envir = environment())
  }
  if(!mutation) {
    ## real SNP data
    a1.ref.base.id <- which(a1 == sequences[half.window.size + 1, ])
    a2.ref.base.id <- which(a2 == sequences[half.window.size + 1, ])
    ## store SNPs that have the same base in SNP and REF alleles only once
    a2.ref.base.id <- a2.ref.base.id[!a2.ref.base.id %in% a1.ref.base.id]
    discard.id <- setdiff(seq_along(a1), c(a1.ref.base.id, a2.ref.base.id))
    if(length(discard.id) > 0) {
      message("Warning: the following sequences are discarded because the reference nucleotide matches to neither a1 nor a2:")
      rsid.rm <- unique(as.character(tbl[keep.id[discard.id], ]$snpid))
      message("snpid\tchr\tsnp\ta1\ta2")
      message(paste(apply(tbl[keep.id[discard.id], c("snpid", "chr", "snp", "a1", "a2")], 1, function(x) paste(x, collapse = "\t")), collapse = "\n"))
    }
  } else {
    ## single nucleotide mutation data
    a1.ref.base.id <- seq_along(a1)
    a2.ref.base.id <- numeric(0)
  }
  sequences <- sequences[, c(a1.ref.base.id, a2.ref.base.id), drop = FALSE]
  snpid.output <- snpid.output[c(a1.ref.base.id, a2.ref.base.id)]
  ref.base <- c(a1[a1.ref.base.id], a2[a2.ref.base.id])
  snp.base <- c(a2[a1.ref.base.id], a1[a2.ref.base.id])
  snpid.index <- snpid.index[c(a1.ref.base.id, a2.ref.base.id)]
  ## Keep the order of SNPs as in the input file
  if(useFile) {
    output.index = seq(ncol(sequences))
  } else {
    output.index = order(snpid.index)
  }
  return(list(
    sequence_matrix= matrix(sequences[, output.index], nrow=2*half.window.size+1),
    ref_base = ref.base[output.index],
    snp_base = snp.base[output.index],
    snpids = snpid.output[output.index],
    transition = transition,
    prior = prior,
    rsid.na = rsid.na,
    rsid.rm = rsid.rm,
    rsid.duplicate = rsid.duplicate,
    rsid.missing = rsid.missing
  ))
}

#' @name LoadFastaData
#' @title Load the SNP data from fasta files.
#' @description Load SNP data.
#' @param ref.filename a fastq file name for the reference allele sequences.
#' @param snp.filename a fastq file name for the SNP allele sequences.
#'@param ref.urlname URL of a fastq file for the reference allele sequences.
#' @param snp.urlname URL of a fastq file for the SNP allele sequences.
#' @param snpids SNP IDs			  
#' @param default.par A boolean for whether using the default Markov parameters.
#'  Default: FALSE.
#' @return A list object containing the following components:
#' \tabular{ll}{
#' sequence_matrix \tab A list of integer vectors representing the deroxyribose 
#' sequence around each SNP.\cr
#' a1 \tab An integer vector for the deroxyribose at the SNP location on the 
#' reference genome.\cr
#' a2 \tab An integer vector for the deroxyribose at the SNP location on the SNP
#'  genome.\cr
#' }
#' The results are coded as: "A"-1, "C"-2, "G"-3, "T"-4.
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo 
#' \email{chandler.c.zuo@@gmail.com}
#' @examples LoadFastaData(
#' ref.urlname="http://pages.stat.wisc.edu/~keles/atSNP-Data/sample_1.fasta",
#' snp.urlname="http://pages.stat.wisc.edu/~keles/atSNP-Data/sample_2.fasta")
#' @useDynLib atSNP
#' @import BiocFileCache
#' @import rappdirs
#' @export
LoadFastaData <- function(ref.filename = NULL, snp.filename = NULL, 
                          ref.urlname = NULL, snp.urlname = NULL,
                          snpids = NULL, default.par = FALSE) {
  if ( is.null(ref.filename) & is.null(ref.urlname) )  {
    stop("one argument among 'ref.filename' and 'ref.urlname' should be provided.")
  } else {
    if(!is.null(ref.filename) & !is.null(ref.urlname))
      stop("only one argument among 'ref.filename' and 'ref.urlname' should be provided.")
  }
  if(is.null(ref.filename)) {
    bfc <- BiocFileCache(cache = user_cache_dir(appname = "BiocFileCache"), ask = FALSE)
    ref.rid <- bfcrid(bfcquery(bfc, query=basename(ref.urlname), exact=TRUE, field="rname"))
    if (!length(ref.rid))
      ref.rid <- names(bfcadd(bfc, rname=basename(ref.urlname), ref.urlname))
    refdat <- read.table(bfcrpath(rids=ref.rid))
  } else {
    refdat <- read.table(ref.filename)
  }
  
  if ( is.null(snp.filename) & is.null(snp.urlname) )  {
    stop("one argument among 'snp.filename' and 'snp.urlname' should be provided.")
  } else {
    if(!is.null(snp.filename) & !is.null(snp.urlname))
      stop("only one argument among 'snp.filename' and 'snp.urlname' should be provided.")
  }
  if(is.null(snp.filename)) {
    bfc <- BiocFileCache(cache = user_cache_dir(appname = "BiocFileCache"), ask = FALSE)
    snp.rid <- bfcrid(bfcquery(bfc, query=basename(snp.urlname), exact=TRUE, field="rname"))
    if (!length(snp.rid))
      snp.rid <- names(bfcadd(bfc, rname=basename(snp.urlname), snp.urlname))
    snpdat <- read.table(bfcrpath(rids=snp.rid))
  } else {
    snpdat <- read.table(snp.filename)
  }

  if(nrow(refdat) != nrow(snpdat)) {
    stop("'ref.data' and 'snp.data' should have the same number of rows.")
  }
  n <- nrow(refdat)
  if(n%%2==1) {
    stop("'ref.data' and 'snp.data' should have an even number of rows.")
    }

  ids <- 2 * seq(n / 2)
  refseqs <- as.character(refdat[ids, 1])
  snpseqs <- as.character(snpdat[ids, 1])

  if(is.null(snpids)) {
   snpids<-gsub(">", "", as.character(refdat[ids-1,1]))
 }

  if(var(sapply(refseqs, nchar)) != 0) {
    stop("sequences in 'ref.data' have different lengths.")
  }
  if(var(sapply(snpseqs, nchar)) != 0) {
    stop("sequences in 'snp.data' have different lengths.")
  }
  codes <- seq(4)
  names(codes) <- c("A", "C", "G", "T")
  refmat <- sapply(refseqs,
                   function(x)
                   codes[strsplit(x, "")[[1]]])
  snpmat <- sapply(snpseqs,
                   function(x)
                   codes[strsplit(x, "")[[1]]])
  colnames(refmat) <- colnames(snpmat) <- rownames(refmat) <- rownames(snpmat) <- NULL
  m <- nrow(refmat)
  if(nrow(refmat) != nrow(snpmat)) {
    stop("the sequences for the SNP alleles and the reference alleles have different lengths.")
  }
  if(m %% 2 == 0) {
    stop("each sequence must have an odd number of length.")
  }

  id.na1 <- which(apply(refmat, 2, function(x) sum(is.na(x))) > 0)
  id.na2 <- which(is.na(snpmat[(m + 1) / 2, ]))
  id.na <- union(id.na1, id.na2)
  if(length(id.na) > 0) {
    message("Warning: the following sequences are discarded, because they include bases other than A, C, G, T: ")
    message(paste(id.na, collapse = ", "))
  }

  id.wrong <- which(apply((refmat != snpmat)[-(m + 1) / 2, ], 2, sum) > 0)
  if(length(id.wrong) > 0) {
    message("Warning: the following sequences are discarded, because they have unidentical nucleotides between the SNP and the reference allele at positions other than the central location: ")
    message(paste(id.wrong, collapse = ", "))
  }

  ids <- union(id.wrong, id.na)
  if(length(ids) > 0) {
    sequences <- refmat[, -ids, drop = FALSE]
    ref.base <- refmat[(m + 1) / 2, -ids]
    snp.base <- snpmat[(m + 1) / 2, -ids]
  } else {
    sequences <- refmat
    ref.base <- refmat[(m + 1) / 2, ]
    snp.base <- snpmat[(m + 1) / 2, ]
  }

  if(!default.par) {
    transition <- .Call("transition_matrix", sequences, package = "atSNP")
    prior <- rowSums(transition)
    prior <- prior / sum(prior)
    transition <- transition / rowSums(transition)
    names(prior) <- colnames(transition) <- rownames(transition) <- c("A", "C", "G", "T")
  } else {
    data(default_par, envir = environment())
  }
  colnames(sequences)<-snpids

  return(list(
              sequence_matrix= sequences,
              ref_base = ref.base,
              snp_base = snp.base,
              transition = transition,
              prior = prior, snpids = snpids
              ))
}

#' @name ComputeMotifScore
#' @title Compute the scores for SNP effects on motifs.
#' @description Compute the log-likelihood scores for motifs.
#' @param motif.lib A list object with the output format of function 
#' \code{\link{LoadMotifLibrary}}.
#' @param snp.info A list object with the output format of function 
#' \code{\link{LoadSNPData}}.
#' @param ncores An integer for the number of parallel process. Default: 1.
#' @details This function computes the binding affinity scores for both alleles 
#' at each SNP window. For each pair of SNP and motif, it finds the subsequence 
#' from both strand that maximizes the affinity binding score. It returns both 
#' the matching positions and the maximized affinity scores.
#' @return A list of two data.frame's. Field \code{snp.tbl} contains:
#' \tabular{cc}{
#' snpid \tab SNP id.\cr
#' ref_seq \tab Reference allele nucleotide sequence.\cr
#' snp_seq \tab SNP allele nucleotide sequence.\cr
#' ref_seq_rev \tab Reference allele nucleotide sequence on the reverse 
#' strand.\cr
#' snp_seq_rev \tab SNP allele nucleotide sequence on the reverse strand.\cr}
#' Field \code{motif.score} contains:
#' \tabular{cc}{
#' motif \tab Name of the motif.\cr
#' motif_len \tab Length of the motif.\cr
#' ref_start, ref_end, ref_strand \tab Location of the best matching subsequence
#'  on the reference allele.\cr
#' snp_start, snp_end, snp_strand \tab Location of the best matching subsequence
#'  on the SNP allele.\cr
#' log_lik_ref \tab Log-likelihood score for the reference allele.\cr
#' log_lik_snp \tab Log-likelihood score for the SNP allele.\cr
#' log_lik_ratio \tab The log-likelihood ratio.\cr
#' log_enhance_odds \tab Difference in log-likelihood ratio between SNP allele 
#' and reference allele based on the best matching subsequence on the reference 
#' allele.\cr
#' log_reduce_odds \tab Difference in log-likelihood ratio between reference 
#' allele and SNP allele based on the best matching subsequence on the SNP 
#' allele.\cr
#' }
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' ComputeMotifScore(motif_library, snpInfo, ncores = 2)
#' @useDynLib atSNP
#' @import data.table
#' @importFrom BiocParallel bpmapply MulticoreParam SnowParam
#' @export
ComputeMotifScore <- function(motif.lib, snp.info, ncores = 1) {
  ## check arguments
  if(sum(!unlist(sapply(motif.lib, is.matrix))) > 0 | sum(unlist(sapply(motif.lib, ncol)) != 4) > 0) {
    stop("'motif.lib' must be a list of numeric matrices each with 4 columns.")
  }
  if(sum(!c("sequence_matrix", "snp_base", "ref_base") %in% names(snp.info)) > 0) {
    stop("'snp.info' must contain three components: 'ref_base', 'snp_base', 'sequence_matrix'.")
  }
  if(ncol(snp.info$sequence_matrix) != length(snp.info$ref_base) | length(snp.info$ref_base) != length(snp.info$snp_base)) {
    stop("the number of columns of 'snp.info$sequence_matrix', the length of 'snp.info$ref_base' and the length of 'snp.info$snp_base' must be the same.")
  }
  if( ! all( sort(unique(c(c(snp.info$sequence_matrix), snp.info$ref_base, snp.info$snp_base))) %in% seq(4) )) {
    stop("'snp.info$sequence_matrix', 'snp.info$ref_base', 'snp.info$snp_base' can only contain entries in 1, 2, 3, 4.")
  }
  if(nrow(snp.info$sequence_matrix) / 2 == as.integer(nrow(snp.info$sequence_matrix) / 2)) {
    stop("'snp.info$sequence_matrix' must have an odd number of rows so that the central row refers to the SNP nucleotide.")
  }

  motifs <- names(motif.lib)
  snpids <- snp.info$snpids
  snpbases<-ifelse(snp.info$snp_base==1, "A", ifelse(snp.info$snp_base==2, "C", ifelse(snp.info$snp_base==3, "G", "T")))	
  nmotifs <- length(motif.lib)
  len_seq <- nrow(snp.info$sequence_matrix)
  snp.info$sequence_matrix[(len_seq+1)/2,]<-snp.info$ref_base
  ncores <- min(c(ncores, length(snp.info$ref_base)))

  k <- as.integer(length(snp.info$ref_base) / ncores)
  if(ncores > 1) {
    if(Sys.info()[["sysname"]] == "Windows"){
      snow <- SnowParam(workers = ncores, type = "SOCK")
      motif_score_par_list <- bpmapply(function(x) motif_score_par(i=x, par.k=k, par.ncores=ncores, par.motifs=motifs, par.nmotifs=nmotifs, par.snpids=snpids,  par.snpbases=snpbases, par.len_seq=len_seq, par.motif.lib=motif.lib, par.snp.info=snp.info), seq(ncores), BPPARAM = snow,SIMPLIFY = FALSE)
    } else {
      motif_score_par_list <- bpmapply(function(x) motif_score_par(i=x, par.k=k, par.ncores=ncores, par.motifs=motifs, par.nmotifs=nmotifs, par.snpids=snpids,  par.snpbases=snpbases, par.len_seq=len_seq, par.motif.lib=motif.lib, par.snp.info=snp.info), seq(ncores), BPPARAM = MulticoreParam(workers = ncores),
                                SIMPLIFY = FALSE)
    } 
  }
    else {
    motif_score_par_list <- list(motif_score_par(i=1, par.k=k, par.ncores=ncores, par.motifs=motifs, par.nmotifs=nmotifs, 
                                                 par.snpids=snpids,  par.snpbases=snpbases, par.len_seq=len_seq, 
                                                 par.motif.lib=motif.lib, par.snp.info=snp.info))
   }

  motif.scores_dt <- motif_score_par_list[[1]]
  if(ncores > 1) {
    for(i in 2:ncores) {
      motif.scores_dt <- rbind(motif.scores_dt, motif_score_par_list[[i]])
    }
  }
#  setkey(motif.scores_dt, motif, snpid, snpbase)
  motif.scores<-as.data.frame(motif.scores_dt, stringAsFactors=FALSE)
  motif.scores<-motif.scores[order(motif.scores$motif, motif.scores$snpid, motif.scores$snpbase), ]
  # motif.scores[with(motif.scores, order(motif, snpid, snpbase)), ]
  ## sequences on the reference genome
  ref_seqs <- apply(snp.info$sequence_matrix, 2, function(x) paste(c("A", "C", "G", "T")[x], collapse = ""))
  ref_seqs_rev <- apply(snp.info$sequence_matrix, 2, function(x) paste(c("A", "C", "G", "T")[5 - rev(x)], collapse = ""))
  ## sequences on the snp allele
  id1 <- seq(as.integer(nrow(snp.info$sequence_matrix) / 2))
  id2 <- id1 + (nrow(snp.info$sequence_matrix) + 1) / 2
  snp_seqs <- apply(rbind(snp.info$sequence_matrix, snp.info$snp_base), 2,
                    function(x)
                    paste(c("A", "C", "G", "T")[x[c(id1, length(x), id2)]],
                          collapse = "")
                    )
  snp_seqs_rev <- apply(rbind(snp.info$sequence_matrix, snp.info$snp_base), 2,
                    function(x)
                    paste(c("A", "C", "G", "T")[5 - rev(x[c(id1, length(x), id2)])],
                          collapse = "")
                    )

#  snp_tbl <- data.table(snpid = snpids,
#                        ref_seq = ref_seqs,
#                        snp_seq = snp_seqs,
#                        ref_seq_rev = ref_seqs_rev,
#                        snp_seq_rev = snp_seqs_rev,
#                        snpbase=snpbases)
#  setkey(snp_tbl, snpid, snpbase)
  snp_tbl <- data.frame(snpid = snpids, ref_seq = ref_seqs, snp_seq = snp_seqs, ref_seq_rev = ref_seqs_rev, snp_seq_rev = snp_seqs_rev, snpbase = snpbases, stringsAsFactors=FALSE)
  snp_tbl[with(snp_tbl, order(snpid, snpbase)),]
  return(list(snp.tbl = snp_tbl, motif.scores = motif.scores))
}


#' @name MatchSubsequence
#' @title Compute the matching subsequence.
#' @description This function combines the SNP set, the motif library and the 
#' affinity score table and produce the matching subsequence found at each SNP 
#' location for each motif.
#' @param snp.tbl A data.frame with the following information:
#' \tabular{cc}{
#' snpid \tab SNP id.\cr
#' ref_seq \tab Reference allele nucleotide sequence.\cr
#' snp_seq \tab SNP allele nucleotide sequence.\cr
#' ref_seq_rev \tab Reference allele nucleotide sequence on the reverse 
#' strand.\cr
#' snp_seq_rev \tab SNP allele nucleotide sequence on the reverse strand.\cr}
#' @param motif.scores A data.frame with the following information:
#' \tabular{cc}{
#' motif \tab Name of the motif.\cr
#' motif_len \tab Length of the motif.\cr
#' ref_start, ref_end, ref_strand \tab Location of the best matching subsequence
#'  on the reference allele.\cr
#' snp_start, snp_end, snp_strand \tab Location of the best matching subsequence
#'  on the SNP allele.\cr
#' log_lik_ref \tab Log-likelihood score for the reference allele.\cr
#' log_lik_snp \tab Log-likelihood score for the SNP allele.\cr
#' log_lik_ratio \tab The log-likelihood ratio.\cr
#' log_enhance_odds \tab Difference in log-likelihood ratio between SNP allele 
#' and reference allele based on the best matching subsequence on the reference 
#' allele.\cr
#' log_reduce_odds \tab Difference in log-likelihood ratio between reference 
#' allele and SNP allele based on the best matching subsequence on the SNP 
#' allele.\cr
#' }
#' @param motif.lib A list of the position weight matrices for the motifs.
#' @param snpids A subset of snpids to compute the subsequences. Default: NULL, 
#' when all snps are computed.
#' @param motifs A subset of motifs to compute the subsequences. Default: NULL, 
#' when all motifs are computed.
#' @param ncores The number of cores used for parallel computing.
#' @return A data.frame containing all columns in both \code{snp.tbl} and 
#' \code{motif.scores}. In addition, the following columns are added:
#' \tabular{ll}{
#' ref_match_seq \tab Best matching subsequence on the reference allele.\cr
#' snp_match_seq \tab Best matching subsequence on the SNP allele.\cr
#' ref_seq_snp_match \tab Subsequence on the reference allele corresponding to 
#' the best matching location on the SNP allele.\cr
#' snp_seq_ref_match \tab Subsequence on the SNP allele corresponding to the 
#' best matching location on the reference allele.\cr
#' }
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo 
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' MatchSubsequence(motif_scores$snp.tbl, motif_scores$motif.scores, 
#' motif_library, ncores=2)
#' @useDynLib atSNP
#' @import data.table
#' @importFrom BiocParallel bpmapply MulticoreParam SnowParam
#' @export
MatchSubsequence <- function(snp.tbl, motif.scores, motif.lib, snpids = NULL, motifs = NULL, ncores = 1) {
  if(is.null(snpids)) {
    snpids <- unique(snp.tbl$snpid)
  }
  if(is.null(motifs)) {
    motifs <- unique(motif.scores$motif)
  }
  if(sum(! motifs %in% names(motif.lib)) > 0) {
    motif.discard <- setdiff(motifs, unique(names(motif.lib)))
    message("Warning: the following motifs are not included in 'motif.lib' and are discarded: ")
    message(paste(motif.discard, collapse = ", "))
    motifs <- setdiff(motifs, motif.discard)
  }
  if(sum(! snpids %in% motif.scores$snpid) > 0) {
    snp.discard <- setdiff(snpids, unique(motif.scores$snpid))
    message("Warning: the following snpids are not included in 'motif.scores' and are discarded: ")
    message(paste(snp.discard, collapse = ", "))
    snpids <- setdiff(snpids, snp.discard)
  }
  snpids <- unique(snpids)
  motifs <- unique(motifs)
  motif.scores_dt<-as.data.table(motif.scores)
  setkey(motif.scores_dt, motif, snpid, snpbase)
  snp.tbl <- as.data.table(snp.tbl)
  setkey(snp.tbl, snpid, snpbase)
  motif.scores <- motif.scores_dt[snpid %in% snpids & motif %in% motifs, ]
  snp.tbl <- snp.tbl[snpid %in% snpids, ]
  snp.tbl[, len_seq := nchar(ref_seq)]

  ## get the IUPAC subsequence for the motifs
  motif.tbl <- data.table(
    motif = motifs,
    IUPAC = sapply(motif.lib[motifs],
      function(x) GetIUPACSequence(x, prob = 0.25))
  )
  setkey(motif.tbl, motif)
  setkey(snp.tbl, snpid, snpbase)
  
  ncores <- min(c(ncores, length(snpids)))

    k <- as.integer(length(snpids) / ncores)

      if(ncores > 1) {
    if(Sys.info()[["sysname"]] == "Windows"){
      snow <- SnowParam(workers = ncores, type = "SOCK")
      motif_score_par_list<-bpmapply(function(x) match_subseq_par(i=x, par.k=k, par.ncores=ncores, par.snp.tbl=snp.tbl, par.snpids=snpids, par.motif.scores=motif.scores, par.motif=motif, par.motif.tbl=motif.tbl), seq(ncores), BPPARAM = snow,SIMPLIFY = FALSE)
    }else{
      motif_score_par_list<-bpmapply(function(x) match_subseq_par(i=x, par.k=k, par.ncores=ncores, par.snp.tbl=snp.tbl, par.snpids=snpids, par.motif.scores=motif.scores, par.motif=motif, par.motif.tbl=motif.tbl), seq(ncores), BPPARAM = MulticoreParam(workers = ncores),
                                SIMPLIFY = FALSE)
    }
  } else 
  {
    motif_score_par_list<-list(match_subseq_par(1, par.k=k, par.ncores=ncores, par.snp.tbl=snp.tbl, 
                                                par.snpids=snpids, par.motif.scores=motif.scores,
                                                par.motif=motif, par.motif.tbl=motif.tbl))
  }
  
  motif_score_tbl <- motif_score_par_list[[1]]
  if(ncores > 1) {
    for(i in 2:ncores) {
      motif_score_tbl <- rbind(motif_score_tbl,
                               motif_score_par_list[[i]])
    }
  }
 motif_score_tbl<-as.data.frame(motif_score_tbl, stringAsFacotrs=FALSE)
  return(motif_score_tbl)
}

#' @name ComputePValues
#' @title Compute p-values for affinity scores.
#' @description This function computes the p-values for allele-specific affinity
#'  scores and between-allele affinity score changes using the importance 
#'  sampling technique.
#' @param motif.lib A list object with the output format of function 
#' \code{\link{LoadMotifLibrary}}.
#' @param snp.info A list object with the output format of function 
#' \code{\link{LoadSNPData}}.
#' @param motif.scores A data.frame object containing at least the following 
#' columns:
#' \tabular{ll}{
#' motif \tab The name of the motif.\cr
#' log_lik_ref \tab The log-likelihood score for the reference allele.\cr
#' log_lik_snp \tab The log-likelihood score for the SNP allele.\cr
#' }
#' @param ncores An integer for the number of parallel process. Default: 1.
#' @param  testing.mc Monte Carlo sample size of 200 is considered. Do not
#'  change the default unless conducting a quick test. Default: FALSE
#' @param figdir A string for the path to print p-value plots for monitoring 
#' results. Default: NULL (no figure).
#' @return A data.frame extending \code{motif.scores} by the following 
#' additional columns:
#' \tabular{ll}{
#' pval_ref \tab P-values for scores on the reference allele.\cr
#' pval_snp \tab P-values for scores on the SNP allele.\cr
#' pval_cond_ref \tab Conditional p-values for scores on the reference 
#' allele.\cr
#' pval_cond_snp \tab Conditional p-values for scores on the SNP allele.\cr
#' pval_diff \tab P-values for the difference in scores between the reference 
#' and the SNP alleles.\cr
#' pval_rank \tab P-values for the log rank ratio between the reference and the 
#' SNP alleles.\cr
#' }
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo 
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' ComputePValues(motif_library, snpInfo, motif_scores$motif.scores, ncores = 2, testing.mc=TRUE)
#' @import Rcpp
#' @import data.table
#' @import ggplot2
#' @importFrom BiocParallel bpmapply MulticoreParam SnowParam
#' @useDynLib atSNP
#' @export
ComputePValues <- function(motif.lib, snp.info, motif.scores, ncores = 1, testing.mc=FALSE, figdir = NULL) {
  ncores <- min(c(ncores, length(motif.lib)))
  results <- as.list(seq_along(motif.lib))
  nsets <- as.integer(length(motif.lib) / ncores)
  motif.scores <- as.data.table(motif.scores)
  prior <- snp.info$prior
  transition <- snp.info$transition
  
  if(Sys.info()[["sysname"]] == "Windows"){
    snow <- SnowParam(workers = ncores, type = "SOCK")
    results<-bpmapply(function(x) results_motif_par(motif.id=x, par.prior=prior, par.transition=transition, par.motif.lib=motif.lib, par.motif.scores=motif.scores, par.testing.mc=testing.mc, par.figdir=figdir), seq_along(motif.lib), BPPARAM = snow,SIMPLIFY = FALSE)
  }else{
    results<-bpmapply(function(x) results_motif_par(motif.id=x, par.prior=prior, par.transition=transition, par.motif.lib=motif.lib, par.motif.scores=motif.scores, par.testing.mc=testing.mc, par.figdir=figdir), seq_along(motif.lib), BPPARAM = MulticoreParam(workers = ncores),
                      SIMPLIFY = FALSE)
  }

  motif.scores_dt<-as.data.table(motif.scores)
  setkey(motif.scores_dt, motif, snpid, snpbase)
  
  for(i in seq_along(results)) {
    motif.scores_dt[results[[i]]$rowids, pval_ref := results[[i]]$pval_a[, 1]]
    motif.scores_dt[results[[i]]$rowids, pval_snp := results[[i]]$pval_a[, 2]]
    motif.scores_dt[results[[i]]$rowids, pval_cond_ref := results[[i]]$pval_cond[, 1]]
    motif.scores_dt[results[[i]]$rowids, pval_cond_snp := results[[i]]$pval_cond[, 2]]
    motif.scores_dt[results[[i]]$rowids, pval_diff := results[[i]]$pval_diff[, 1]]
    motif.scores_dt[results[[i]]$rowids, pval_rank := results[[i]]$pval_rank[, 1]]
  }
  motif.scores.pval<-as.data.frame(motif.scores_dt, stringsAsFactors=FALSE)
  return(motif.scores.pval)
}

#' @name GetIUPACSequence
#' @title Get the IUPAC sequence of a motif.
#' @description Convert the posotion weight matrix of a motif to the IUPAC 
#' sequence.
#' @param pwm The position weight matrix, with the columns representing 
#' A, C, G, T.
#' @param prob The probability threshold. Default: 0.25.
#' @return A character string.
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo 
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' GetIUPACSequence(motif_library[[1]], prob = 0.2)
#' @export
GetIUPACSequence <- function(pwm, prob = 0.25) {
  iupac.table <-
    c(".", "A", "C", "M", "G", "R", "S", "V", "T", "W", "Y", "H", "K", "D", "B", "N")
  iupac.value <- (pwm >= prob) %*% c(1, 2, 4, 8) + 1
  return(paste(iupac.table[iupac.value], collapse = ""))
}
Any scripts or data that you put into this service are public.
atSNP documentation built on April 28, 2020, 6:50 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
atSNP
Affinity test for identifying regulatory SNPs

R/motif_analysis.R
In atSNP: Affinity test for identifying regulatory SNPs

Defines functions GetIUPACSequence ComputePValues MatchSubsequence ComputeMotifScore LoadFastaData LoadSNPData LoadMotifLibrary

Documented in ComputeMotifScore ComputePValues GetIUPACSequence LoadFastaData LoadMotifLibrary LoadSNPData MatchSubsequence

Try the atSNP package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

atSNP Affinity test for identifying regulatory SNPs

R/motif_analysis.R In atSNP: Affinity test for identifying regulatory SNPs

Defines functions GetIUPACSequence ComputePValues MatchSubsequence ComputeMotifScore LoadFastaData LoadSNPData LoadMotifLibrary

Documented in ComputeMotifScore ComputePValues GetIUPACSequence LoadFastaData LoadMotifLibrary LoadSNPData MatchSubsequence

Try the atSNP package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

atSNP
Affinity test for identifying regulatory SNPs

R/motif_analysis.R
In atSNP: Affinity test for identifying regulatory SNPs