Nothing
#' @name LoadMotifLibrary
#' @title Load position weight matrices.
#' @description Load the file for position weight matrices for motifs.
#' @param filename a MEME format file name.
#' @param urlname URL containing a MEME format file.
#' @param tag A string that marks the description line of the position weight
#' matrix.
#' @param skiprows Number of description lines before each position weight
#' matrix.
#' @param skipcols Number of columns to be skipped in the position weight
#' matrix.
#' @param transpose If TRUE (default), then the position weight matrix should
#' have 4 columns. Otherwise, it should have 4 rows.
#' @param field The index of the field in the description line, seperated by
#' space, that indicates the motif name.
#' @param sep A vector of chars for the string separators to parse each lines of
#' the matrix. Default: c(" ", "\\t").
#' @param pseudocount An integer for the pseudocount added to each of the
#' original matrices. Default: 0. Recommended to be 1 if the original matrices
#' are position frequency matrices.
#' @details This function reads the formatted file containing motif information
#' and convert them into a list of position weight matrices. The list of
#' arguments should provide enough flexibility of importing a varying number of
#' formats. Some examples are the following:
#' For MEME format, the suggested arguments are: tag = 'Motif', skiprows = 2,
#' skipcols = 0, transpose = FALSE, field = 2, sep = ' ';
#' For motif files from JOHNSON lab (i.e.
#' http://johnsonlab.ucsf.edu/mochi_files/JASPAR_motifs_H_sapiens.txt),
#' the suggested arguments are: tag = '/NAME', skiprows = 1, skipcols = 0,
#' transpose = FALSE, field = 2, sep = "\\t";
#' For JASPAR pfm matrices (i.e. http://jaspar.genereg.net/download/CORE/JASPAR
#' 2018_CORE_vertebrates_non-redundant_pfms_jaspar.txt), the suggested arguments
#' are: tag = ">", skiprows = 1, skipcols = 0, transpose = TRUE, field = 1,
#' sep = "\\t"; For the TRANSFAC library provided by UCF bioinformatics groups
#' (i.e. http://gibbs.biomed.ucf.edu/PreDREM/download/nonredundantmotif.transfac
#' ), the suggested arguments are: tag = "DE", skiprows = 1, skipcols = 1,
#' transpose = FALSE, field = 2, sep = "\\t".
#' @return A list object of position weight matrices.
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' pwms <- LoadMotifLibrary(
#' urlname="http://pages.stat.wisc.edu/~keles/atSNP-Data/pfm_vertebrates.txt",
#' tag = ">", transpose = FALSE, field = 1, sep = c("\t", " ", ">"),
#' skipcols = 1, skiprows = 1, pseudocount = 1)
#' @useDynLib atSNP
#' @import BiocFileCache
#' @import rappdirs
#' @export
LoadMotifLibrary <- function(filename = NULL, urlname = NULL, tag = "MOTIF", transpose = FALSE, field = 2, sep = c("\t", " "), skipcols = 0, skiprows = 2, pseudocount = 0) {
if ( is.null(filename) & is.null(urlname) ) {
stop("one argument among 'filename' and 'urlname' should be provided.")
} else {
if(!is.null(filename) & !is.null(urlname))
stop("only one argument among 'filename' and 'urlname' should be provided.")
}
if(is.null(filename)) {
bfc <- BiocFileCache(cache = user_cache_dir(appname = "BiocFileCache"), ask = FALSE)
rid <- bfcrid(bfcquery(bfc, query=basename(urlname), exact=TRUE, field="rname"))
if (!length(rid))
rid <- names(bfcadd(bfc, rname=basename(urlname), urlname))
lines<-readLines(bfcrpath(rids=rid))
} else {
lines<-readLines(filename)
}
motifLineNums <- grep(tag, lines)
if(length(myStrSplit(lines[motifLineNums[1]], sep)[[1]]) >= field) {
motifnames <-
sapply(myStrSplit(lines[motifLineNums], sep), function(x) x[field])
} else {
motifnames <-
sapply(myStrSplit(lines[motifLineNums], sep), function(x) x[field])
}
allmats <- as.list(seq_along(motifnames))
for(matrixId in seq_along(motifLineNums)) {
motifLineNum <- motifLineNums[matrixId] + skiprows
if(!transpose) {
pwm <- NULL
nrows <- 0
tmp <- myStrSplit(lines[nrows + motifLineNum], split = sep)[[1]]
tmp <- tmp[nchar(tmp) > 0]
while(length(tmp) >= 4 + skipcols) {
tmp <- as.numeric(tmp[skipcols + seq(4)])
if(sum(is.na(tmp)) == 0) {
pwm <- rbind(pwm, tmp)
}
nrows <- nrows + 1
tmp <- myStrSplit(lines[nrows + motifLineNum], split = sep)[[1]]
tmp <- tmp[nchar(tmp) > 0]
}
} else {
nrows <- 4
if(skipcols == 0) {
pwm <-
matrix(as.numeric(unlist(myStrSplit(lines[seq(nrows) + motifLineNum - 1], split = sep))), ncol = 4)
} else {
pwm <-
matrix(as.numeric(unlist(sapply(myStrSplit(lines[seq(nrows) + motifLineNum - 1], split = sep), function(x) x[-seq(skipcols)]))), ncol = 4)
}
}
pwm <- pwm + pseudocount
pwm <- pwm / apply(pwm, 1, sum)
pwm <- t(apply(pwm, 1,
function(x) {
x[x < 1e-10] <- 1e-10 / (1 - 1e-10 * sum(x < 1e-10)) * sum(x)
return(x / sum(x))
}))
rownames(pwm) <- NULL
allmats[[matrixId]] <- pwm
}
names(allmats) <- motifnames
return(allmats)
}
#' @name LoadSNPData
#' @title Load the SNP information and code the genome sequences around the SNP
#' locations.
#' @description Load the SNP data.
#' @param filename A table containing the SNP information. Must contain at least
#' five columns with exactly the following names:
#' \tabular{ll}{
#' chr \tab chromosome.\cr
#' snp \tab The nucleotide position of the SNP.\cr
#' snpid \tab The names of the SNPs.\cr
#' a1 \tab The deoxyribose for one allele.\cr
#' a2 \tab The deoxyribose for the other allele.\cr
#' }
#' If this file exists already, it is used to extract the SNP information.
#' Otherwise, SNP information extracted using argument 'snpids' is outputted to
#' this file.
#' @param snpids A vector of rs ids for the SNPs. This argument is overidden
#' if the file with name \code{filename} exists.
#' @param snp.lib A string of the library name to obtain the SNP information
#' based on rs ids. Default: "SNPlocs.Hsapiens.dbSNP144.GRCh38".
#' @param genome.lib A string of the library name for the genome version.
#' Default: "BSgenome.Hsapiens.UCSC.hg38".
#' @param half.window.size An integer for the half window size around the SNP
#' within which the motifs are matched. Default: 30.
#' @param default.par A boolean for whether using the default Markov parameters.
#' Default: FALSE.
#' @param mutation A boolean for whether this is mutation data. See details for
#' more information. Default: FALSE.
#' @param ... Other parameters passed to \code{\link[utils]{read.table}}.
#' @details This function extracts the nucleotide sequence within a window
#' around each SNP and code them using 1-A, 2-C, 3-G, 4-T.\cr
#' There are two ways of obtaining the nucleotide sequences. If \code{filename}
#' is not NULL and the file exists, it should contain the positions and alleles
#' for each SNP. Based on such information, the sequences around SNP positions
#' are extracted using the Bioconductor annotation package specified by
#' \code{genome.lib}. Users should make sure that this annotation package
#' corresponds to the correct species and genome version of the actual data.
#' Alternatively, users can also provide a vector of rs ids via the argument
#' \code{snpids}. The SNP locations and allele information is then obtained via
#' the Bioconductor annotation package specified by \code{snp.lib}, and passed
#' on to the package specified by \code{genome.lib} to further obtain the
#' nucleotide sequences.\cr
#' If \code{mutation=FALSE} (default), this function assumes that the data is
#' for SNP analysis, and the reference genome should be consistent with either
#' the a1 or a2 nucleotide. When extracting the genome sequence around each SNP
#' position, this function compares the nucleotide at the SNP location on the
#' reference genome with both a1 and a2 to distinguish between the reference
#' allele and the SNP allele. If the nucleotide extracted from the reference
#' genome does not match either a1 or a2, the SNP is discarded. The discarded
#' SNPs are in the 'rsid.rm' field in the output.\cr
#' Alternatively, if \code{mutation=TRUE}, this function assumes that the data
#' is for general single nucleotide mutation analysis. After extracting the
#' genome sequence around each SNP position, it replaces the nucleotide at the
#' SNP location by the a1 nucleotide as the 'reference' allele sequence, and by
#' the a2 nucleotide as the 'snp' allele sequence. It does NOT discard the
#' sequence even if neither a1 or a2 matches the reference genome. When this
#' data set is used in other functions, such as \code{\link{ComputeMotifScore}},
#' \code{\link{ComputePValues}}, all the results (i.e. affinity scores and
#' their p-values) for the reference allele are indeed for the a1 allele, and
#' results for the SNP allele are indeed for the a2 allele.\cr
#' If the input is a list of rsid's, the SNP information extracted from
#' \code{snp.lib} may contain more than two alleles for a single location. For
#' such cases, \code{\link{LoadSNPData}} first extracts all pairs of alleles
#' associated with those locations. If 'mutation=TRUE', all those pairs are
#' considered as pairs of reference and SNP alleles, and their information is
#' contained in 'sequence_matrix', 'a1', 'a2' and 'snpid'. If 'mutation=FALSE',
#' \code{\link{LoadSNPData}} further filters these pairs based on whether one
#' allele matches to the reference genome nucleotide extracted from
#' \code{genome.lib}. Only those pairs with one allele matching the reference
#' genome nucleotide is considered as pairs of reference and SNP alleles, with
#' their information contained in 'sequence_matrix', 'a1', 'a2' and 'snpid'.\cr
#' @return A list object containing the following components:
#' \tabular{ll}{
#' sequence_matrix \tab A list of integer vectors representing the deroxyribose
#' sequence around each SNP.\cr
#' a1 \tab An integer vector for the deroxyribose at the SNP location on the
#' reference genome.\cr
#' a2 \tab An integer vector for the deroxyribose at the SNP location on the
#' SNP genome.\cr
#' snpid \tab A string vector for the SNP rsids.\cr
#' rsid.missing \tab If the data source is a list of rsids, this field records
#' rsids for SNPs that are discarded because they are not in the SNPlocs package.\cr
#' rsid.duplicate \tab If the data source is a list of rsids, this field records
#' rsids for SNPs that based on the SNPlocs package, this locus has more than
#' 2 alleles. \cr
#' rsid.na \tab This field records rsids for SNPs that are discarded because the
#' nucleotide sequences contain none ACGT characters.\cr
#' rsid.rm \tab If the data source is a table and \code{mutation=FALSE}, this
#' field records rsids for SNPs that are discarded because the nucleotide on the
#' reference genome matches neither 'a1' or 'a2' in the data source.\cr
#' }
#' The results are coded as: "A"-1, "C"-2, "G"-3, "T"-4.
#' @author Chandler Zuo \email{chandler.c.zuo@@gmail.com}
#' @examples
#' \dontrun{LoadSNPData(snpids = c("rs53576", "rs7412"),
#' genome.lib ="BSgenome.Hsapiens.UCSC.hg38", snp.lib =
#' "SNPlocs.Hsapiens.dbSNP144.GRCh38", half.window.size = 30, default.par = TRUE
#' , mutation = FALSE)}
#' @import BSgenome
#' @useDynLib atSNP
#' @export
LoadSNPData <- function(filename = NULL, genome.lib = "BSgenome.Hsapiens.UCSC.hg38",
snp.lib = "SNPlocs.Hsapiens.dbSNP144.GRCh38",
snpids = NULL, half.window.size = 30, default.par = FALSE,
mutation = FALSE, ...) {
useFile <- FALSE
rsid.rm <- rsid.missing <- rsid.duplicate <- rsid.na <- NULL
if(!is.null(filename)) {
if(file.exists(filename)) {
useFile <- TRUE
}
}
if(useFile) {
if(!is.null(snpids)) {
message("Warning: load SNP information from 'filename' only. The argument 'snpids' is overridden.")
}
tbl <- read.table(filename, header = TRUE, stringsAsFactors = FALSE, colClasses = c("character"), ...)
tbl<-tbl[c("snpid", "chr", "snp", "a1", "a2")]
tbl$snp<-as.integer(tbl$snp)
## check if the input file has the required information
if(sum(!c("snp", "chr", "a1", "a2", "snpid") %in% names(tbl)) > 0) {
stop("'filename' must be a table containing 'snp' and 'chr' columns.")
}
snpid.index <- seq(nrow(tbl))
} else {
if(is.null(snpids)) {
stop("either 'snpids' should be a vector, or 'filename' should be the file name that contains the SNP information.")
}
snpid.index <- seq_along(snpids)
## load the corresponding snp library
library(package = snp.lib, character.only = TRUE)
rsid.missing.all <- NULL
snps <- get(snp.lib)
snp.loc <- tryCatch({snpsById(snps, snpids, ifnotfound="error")}, error = function(e) return(e$message))
## remove rsids not included in the database
if(is(snp.loc, "character")) {
rsid.missing <- myStrSplit(snp.loc, split = c(": ", "\n"))[[1]][-1]
rsid.missing.all <- myStrSplit(rsid.missing, split = c(",", " "))[[1]]
snpids <- snpids[!snpids %in% rsid.missing]
snp.loc <- snpsById(snps, snpids, ifnotfound="drop")
}
if(!is.null(rsid.missing.all)) {
message("Warning: the following rsids are not included in the database and discarded: ")
message(paste(rsid.missing.all, collapse = ", "))
rsid.missing <- rsid.missing.all
}
snp.alleles <- IUPAC_CODE_MAP[snp.loc@elementMetadata@listData$alleles_as_ambig]
snp.strands<-as.character(snp.loc@strand)
if(sum(nchar(snp.alleles) > 2) > 0) {
message("Warning: the following SNPs have more than 2 alleles. All pairs of nucleotides are considered as pairs of the SNP and the reference allele:")
rsid.duplicate <- snpids[nchar(snp.alleles) > 2]
message(paste(rsid.duplicate, collapse = ", "))
}
## retain only SNPs with >= 2 alleles
tbl <- NULL
for(nalleles in 2:4) {
ids <- which(sapply(snp.alleles, nchar) == nalleles)
if(length(ids) == 0) {
next
}
snp.loc.n <- snp.loc[ids]
snp.alleles.n <- snp.alleles[ids]
snp.ids.n <- snpids[ids]
snp.alleles.n <- strsplit(snp.alleles.n, "")
snp.strands.n <- snp.strands[ids]
## get all pairs of alleles
for(i_allele1 in seq(nalleles - 1)) {
for(i_allele2 in (i_allele1 + 1):nalleles) {
a1 <- sapply(snp.alleles.n, function(x) x[i_allele1])
a2 <- sapply(snp.alleles.n, function(x) x[i_allele2])
## revert the alleles on the reverse strand
id.rev <- which(snp.strands.n == "-")
if(length(id.rev) > 0) {
rev.codes <- c("A", "C", "G", "T")
names(rev.codes) <- rev(rev.codes)
a1[id.rev] <- rev.codes[a1[id.rev]]
a2[id.rev] <- rev.codes[a2[id.rev]]
}
tbl <- rbind(tbl,
data.frame(snp = as.integer(snp.loc.n@ranges),
chr = as.character(paste0("chr", gsub("ch", "", snp.loc.n@seqnames))),
a1 = as.character(a1),
a2 = as.character(a2),
snpid = as.character(snp.ids.n),
index = snpid.index[ids])
)
}
}
}
tbl <- tbl[order(tbl$index), ]
if(!is.null(filename)) {
write.table(tbl[, c('snp', 'chr', 'a1', 'a2', 'snpid')], file = filename, row.names = FALSE, col.names = TRUE, quote = FALSE)
}
snpid.index <- tbl$index
}
## load the corresponding genome version
library(package = genome.lib, character.only = TRUE)
species<-get(strsplit(genome.lib, "[.]")[[1]][2])
seqvec <- getSeq(species,
as.character(tbl$chr),
start=tbl$snp - half.window.size,
end=tbl$snp + half.window.size,
as.character=TRUE)
codes <- seq(4)
names(codes) <- c("A", "C", "G", "T")
sequences <- sapply(seqvec, function(x) codes[strsplit(x, "")[[1]]])
sequences <- matrix(sequences, ncol = length(seqvec))
snpid.output <- as.character(tbl$snpid)
rownames(sequences) <- NULL
a1 <- codes[as.character(tbl$a1)]
a2 <- codes[as.character(tbl$a2)]
names(a1) <- names(a2) <- NULL
keep.id <- which(colSums(is.na(sequences)) == 0)
if(length(keep.id) < nrow(tbl)) {
message("Warning: the following rows are discarded because the reference genome sequences contain non ACGT characters:")
rsid.na <- tbl[-keep.id, ]$snpid
print(tbl[-keep.id, ])
}
## remove sequences containing non ACGT characters
sequences <- sequences[, keep.id, drop = FALSE]
a1 <- a1[keep.id]
a2 <- a2[keep.id]
snpid.output <- snpid.output[keep.id]
snpid.index <- snpid.index[keep.id]
## whether use the default parameters
if(!default.par) {
transition <- .Call("transition_matrix", sequences, package = "atSNP")
prior <- rowSums(transition)
prior <- prior / sum(prior)
transition <- transition / rowSums(transition)
names(prior) <- colnames(transition) <- rownames(transition) <- c("A", "C", "G", "T")
} else {
data(default_par, envir = environment())
}
if(!mutation) {
## real SNP data
a1.ref.base.id <- which(a1 == sequences[half.window.size + 1, ])
a2.ref.base.id <- which(a2 == sequences[half.window.size + 1, ])
## store SNPs that have the same base in SNP and REF alleles only once
a2.ref.base.id <- a2.ref.base.id[!a2.ref.base.id %in% a1.ref.base.id]
discard.id <- setdiff(seq_along(a1), c(a1.ref.base.id, a2.ref.base.id))
if(length(discard.id) > 0) {
message("Warning: the following sequences are discarded because the reference nucleotide matches to neither a1 nor a2:")
rsid.rm <- unique(as.character(tbl[keep.id[discard.id], ]$snpid))
message("snpid\tchr\tsnp\ta1\ta2")
message(paste(apply(tbl[keep.id[discard.id], c("snpid", "chr", "snp", "a1", "a2")], 1, function(x) paste(x, collapse = "\t")), collapse = "\n"))
}
} else {
## single nucleotide mutation data
a1.ref.base.id <- seq_along(a1)
a2.ref.base.id <- numeric(0)
}
sequences <- sequences[, c(a1.ref.base.id, a2.ref.base.id), drop = FALSE]
snpid.output <- snpid.output[c(a1.ref.base.id, a2.ref.base.id)]
ref.base <- c(a1[a1.ref.base.id], a2[a2.ref.base.id])
snp.base <- c(a2[a1.ref.base.id], a1[a2.ref.base.id])
snpid.index <- snpid.index[c(a1.ref.base.id, a2.ref.base.id)]
## Keep the order of SNPs as in the input file
if(useFile) {
output.index = seq(ncol(sequences))
} else {
output.index = order(snpid.index)
}
return(list(
sequence_matrix= matrix(sequences[, output.index], nrow=2*half.window.size+1),
ref_base = ref.base[output.index],
snp_base = snp.base[output.index],
snpids = snpid.output[output.index],
transition = transition,
prior = prior,
rsid.na = rsid.na,
rsid.rm = rsid.rm,
rsid.duplicate = rsid.duplicate,
rsid.missing = rsid.missing
))
}
#' @name LoadFastaData
#' @title Load the SNP data from fasta files.
#' @description Load SNP data.
#' @param ref.filename a fastq file name for the reference allele sequences.
#' @param snp.filename a fastq file name for the SNP allele sequences.
#'@param ref.urlname URL of a fastq file for the reference allele sequences.
#' @param snp.urlname URL of a fastq file for the SNP allele sequences.
#' @param snpids SNP IDs
#' @param default.par A boolean for whether using the default Markov parameters.
#' Default: FALSE.
#' @return A list object containing the following components:
#' \tabular{ll}{
#' sequence_matrix \tab A list of integer vectors representing the deroxyribose
#' sequence around each SNP.\cr
#' a1 \tab An integer vector for the deroxyribose at the SNP location on the
#' reference genome.\cr
#' a2 \tab An integer vector for the deroxyribose at the SNP location on the SNP
#' genome.\cr
#' }
#' The results are coded as: "A"-1, "C"-2, "G"-3, "T"-4.
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples LoadFastaData(
#' ref.urlname="http://pages.stat.wisc.edu/~keles/atSNP-Data/sample_1.fasta",
#' snp.urlname="http://pages.stat.wisc.edu/~keles/atSNP-Data/sample_2.fasta")
#' @useDynLib atSNP
#' @import BiocFileCache
#' @import rappdirs
#' @export
LoadFastaData <- function(ref.filename = NULL, snp.filename = NULL,
ref.urlname = NULL, snp.urlname = NULL,
snpids = NULL, default.par = FALSE) {
if ( is.null(ref.filename) & is.null(ref.urlname) ) {
stop("one argument among 'ref.filename' and 'ref.urlname' should be provided.")
} else {
if(!is.null(ref.filename) & !is.null(ref.urlname))
stop("only one argument among 'ref.filename' and 'ref.urlname' should be provided.")
}
if(is.null(ref.filename)) {
bfc <- BiocFileCache(cache = user_cache_dir(appname = "BiocFileCache"), ask = FALSE)
ref.rid <- bfcrid(bfcquery(bfc, query=basename(ref.urlname), exact=TRUE, field="rname"))
if (!length(ref.rid))
ref.rid <- names(bfcadd(bfc, rname=basename(ref.urlname), ref.urlname))
refdat <- read.table(bfcrpath(rids=ref.rid))
} else {
refdat <- read.table(ref.filename)
}
if ( is.null(snp.filename) & is.null(snp.urlname) ) {
stop("one argument among 'snp.filename' and 'snp.urlname' should be provided.")
} else {
if(!is.null(snp.filename) & !is.null(snp.urlname))
stop("only one argument among 'snp.filename' and 'snp.urlname' should be provided.")
}
if(is.null(snp.filename)) {
bfc <- BiocFileCache(cache = user_cache_dir(appname = "BiocFileCache"), ask = FALSE)
snp.rid <- bfcrid(bfcquery(bfc, query=basename(snp.urlname), exact=TRUE, field="rname"))
if (!length(snp.rid))
snp.rid <- names(bfcadd(bfc, rname=basename(snp.urlname), snp.urlname))
snpdat <- read.table(bfcrpath(rids=snp.rid))
} else {
snpdat <- read.table(snp.filename)
}
if(nrow(refdat) != nrow(snpdat)) {
stop("'ref.data' and 'snp.data' should have the same number of rows.")
}
n <- nrow(refdat)
if(n%%2==1) {
stop("'ref.data' and 'snp.data' should have an even number of rows.")
}
ids <- 2 * seq(n / 2)
refseqs <- as.character(refdat[ids, 1])
snpseqs <- as.character(snpdat[ids, 1])
if(is.null(snpids)) {
snpids<-gsub(">", "", as.character(refdat[ids-1,1]))
}
if(var(sapply(refseqs, nchar)) != 0) {
stop("sequences in 'ref.data' have different lengths.")
}
if(var(sapply(snpseqs, nchar)) != 0) {
stop("sequences in 'snp.data' have different lengths.")
}
codes <- seq(4)
names(codes) <- c("A", "C", "G", "T")
refmat <- sapply(refseqs,
function(x)
codes[strsplit(x, "")[[1]]])
snpmat <- sapply(snpseqs,
function(x)
codes[strsplit(x, "")[[1]]])
colnames(refmat) <- colnames(snpmat) <- rownames(refmat) <- rownames(snpmat) <- NULL
m <- nrow(refmat)
if(nrow(refmat) != nrow(snpmat)) {
stop("the sequences for the SNP alleles and the reference alleles have different lengths.")
}
if(m %% 2 == 0) {
stop("each sequence must have an odd number of length.")
}
id.na1 <- which(apply(refmat, 2, function(x) sum(is.na(x))) > 0)
id.na2 <- which(is.na(snpmat[(m + 1) / 2, ]))
id.na <- union(id.na1, id.na2)
if(length(id.na) > 0) {
message("Warning: the following sequences are discarded, because they include bases other than A, C, G, T: ")
message(paste(id.na, collapse = ", "))
}
id.wrong <- which(apply((refmat != snpmat)[-(m + 1) / 2, ], 2, sum) > 0)
if(length(id.wrong) > 0) {
message("Warning: the following sequences are discarded, because they have unidentical nucleotides between the SNP and the reference allele at positions other than the central location: ")
message(paste(id.wrong, collapse = ", "))
}
ids <- union(id.wrong, id.na)
if(length(ids) > 0) {
sequences <- refmat[, -ids, drop = FALSE]
ref.base <- refmat[(m + 1) / 2, -ids]
snp.base <- snpmat[(m + 1) / 2, -ids]
} else {
sequences <- refmat
ref.base <- refmat[(m + 1) / 2, ]
snp.base <- snpmat[(m + 1) / 2, ]
}
if(!default.par) {
transition <- .Call("transition_matrix", sequences, package = "atSNP")
prior <- rowSums(transition)
prior <- prior / sum(prior)
transition <- transition / rowSums(transition)
names(prior) <- colnames(transition) <- rownames(transition) <- c("A", "C", "G", "T")
} else {
data(default_par, envir = environment())
}
colnames(sequences)<-snpids
return(list(
sequence_matrix= sequences,
ref_base = ref.base,
snp_base = snp.base,
transition = transition,
prior = prior, snpids = snpids
))
}
#' @name ComputeMotifScore
#' @title Compute the scores for SNP effects on motifs.
#' @description Compute the log-likelihood scores for motifs.
#' @param motif.lib A list object with the output format of function
#' \code{\link{LoadMotifLibrary}}.
#' @param snp.info A list object with the output format of function
#' \code{\link{LoadSNPData}}.
#' @param ncores An integer for the number of parallel process. Default: 1.
#' @details This function computes the binding affinity scores for both alleles
#' at each SNP window. For each pair of SNP and motif, it finds the subsequence
#' from both strand that maximizes the affinity binding score. It returns both
#' the matching positions and the maximized affinity scores.
#' @return A list of two data.frame's. Field \code{snp.tbl} contains:
#' \tabular{cc}{
#' snpid \tab SNP id.\cr
#' ref_seq \tab Reference allele nucleotide sequence.\cr
#' snp_seq \tab SNP allele nucleotide sequence.\cr
#' ref_seq_rev \tab Reference allele nucleotide sequence on the reverse
#' strand.\cr
#' snp_seq_rev \tab SNP allele nucleotide sequence on the reverse strand.\cr}
#' Field \code{motif.score} contains:
#' \tabular{cc}{
#' motif \tab Name of the motif.\cr
#' motif_len \tab Length of the motif.\cr
#' ref_start, ref_end, ref_strand \tab Location of the best matching subsequence
#' on the reference allele.\cr
#' snp_start, snp_end, snp_strand \tab Location of the best matching subsequence
#' on the SNP allele.\cr
#' log_lik_ref \tab Log-likelihood score for the reference allele.\cr
#' log_lik_snp \tab Log-likelihood score for the SNP allele.\cr
#' log_lik_ratio \tab The log-likelihood ratio.\cr
#' log_enhance_odds \tab Difference in log-likelihood ratio between SNP allele
#' and reference allele based on the best matching subsequence on the reference
#' allele.\cr
#' log_reduce_odds \tab Difference in log-likelihood ratio between reference
#' allele and SNP allele based on the best matching subsequence on the SNP
#' allele.\cr
#' }
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' ComputeMotifScore(motif_library, snpInfo, ncores = 2)
#' @useDynLib atSNP
#' @import data.table
#' @importFrom BiocParallel bpmapply MulticoreParam SnowParam
#' @export
ComputeMotifScore <- function(motif.lib, snp.info, ncores = 1) {
## check arguments
if(sum(!unlist(sapply(motif.lib, is.matrix))) > 0 | sum(unlist(sapply(motif.lib, ncol)) != 4) > 0) {
stop("'motif.lib' must be a list of numeric matrices each with 4 columns.")
}
if(sum(!c("sequence_matrix", "snp_base", "ref_base") %in% names(snp.info)) > 0) {
stop("'snp.info' must contain three components: 'ref_base', 'snp_base', 'sequence_matrix'.")
}
if(ncol(snp.info$sequence_matrix) != length(snp.info$ref_base) | length(snp.info$ref_base) != length(snp.info$snp_base)) {
stop("the number of columns of 'snp.info$sequence_matrix', the length of 'snp.info$ref_base' and the length of 'snp.info$snp_base' must be the same.")
}
if( ! all( sort(unique(c(c(snp.info$sequence_matrix), snp.info$ref_base, snp.info$snp_base))) %in% seq(4) )) {
stop("'snp.info$sequence_matrix', 'snp.info$ref_base', 'snp.info$snp_base' can only contain entries in 1, 2, 3, 4.")
}
if(nrow(snp.info$sequence_matrix) / 2 == as.integer(nrow(snp.info$sequence_matrix) / 2)) {
stop("'snp.info$sequence_matrix' must have an odd number of rows so that the central row refers to the SNP nucleotide.")
}
motifs <- names(motif.lib)
snpids <- snp.info$snpids
snpbases<-ifelse(snp.info$snp_base==1, "A", ifelse(snp.info$snp_base==2, "C", ifelse(snp.info$snp_base==3, "G", "T")))
nmotifs <- length(motif.lib)
len_seq <- nrow(snp.info$sequence_matrix)
snp.info$sequence_matrix[(len_seq+1)/2,]<-snp.info$ref_base
ncores <- min(c(ncores, length(snp.info$ref_base)))
k <- as.integer(length(snp.info$ref_base) / ncores)
if(ncores > 1) {
if(Sys.info()[["sysname"]] == "Windows"){
snow <- SnowParam(workers = ncores, type = "SOCK")
motif_score_par_list <- bpmapply(function(x) motif_score_par(i=x, par.k=k, par.ncores=ncores, par.motifs=motifs, par.nmotifs=nmotifs, par.snpids=snpids, par.snpbases=snpbases, par.len_seq=len_seq, par.motif.lib=motif.lib, par.snp.info=snp.info), seq(ncores), BPPARAM = snow,SIMPLIFY = FALSE)
} else {
motif_score_par_list <- bpmapply(function(x) motif_score_par(i=x, par.k=k, par.ncores=ncores, par.motifs=motifs, par.nmotifs=nmotifs, par.snpids=snpids, par.snpbases=snpbases, par.len_seq=len_seq, par.motif.lib=motif.lib, par.snp.info=snp.info), seq(ncores), BPPARAM = MulticoreParam(workers = ncores),
SIMPLIFY = FALSE)
}
}
else {
motif_score_par_list <- list(motif_score_par(i=1, par.k=k, par.ncores=ncores, par.motifs=motifs, par.nmotifs=nmotifs,
par.snpids=snpids, par.snpbases=snpbases, par.len_seq=len_seq,
par.motif.lib=motif.lib, par.snp.info=snp.info))
}
motif.scores_dt <- motif_score_par_list[[1]]
if(ncores > 1) {
for(i in 2:ncores) {
motif.scores_dt <- rbind(motif.scores_dt, motif_score_par_list[[i]])
}
}
# setkey(motif.scores_dt, motif, snpid, snpbase)
motif.scores<-as.data.frame(motif.scores_dt, stringAsFactors=FALSE)
motif.scores<-motif.scores[order(motif.scores$motif, motif.scores$snpid, motif.scores$snpbase), ]
# motif.scores[with(motif.scores, order(motif, snpid, snpbase)), ]
## sequences on the reference genome
ref_seqs <- apply(snp.info$sequence_matrix, 2, function(x) paste(c("A", "C", "G", "T")[x], collapse = ""))
ref_seqs_rev <- apply(snp.info$sequence_matrix, 2, function(x) paste(c("A", "C", "G", "T")[5 - rev(x)], collapse = ""))
## sequences on the snp allele
id1 <- seq(as.integer(nrow(snp.info$sequence_matrix) / 2))
id2 <- id1 + (nrow(snp.info$sequence_matrix) + 1) / 2
snp_seqs <- apply(rbind(snp.info$sequence_matrix, snp.info$snp_base), 2,
function(x)
paste(c("A", "C", "G", "T")[x[c(id1, length(x), id2)]],
collapse = "")
)
snp_seqs_rev <- apply(rbind(snp.info$sequence_matrix, snp.info$snp_base), 2,
function(x)
paste(c("A", "C", "G", "T")[5 - rev(x[c(id1, length(x), id2)])],
collapse = "")
)
# snp_tbl <- data.table(snpid = snpids,
# ref_seq = ref_seqs,
# snp_seq = snp_seqs,
# ref_seq_rev = ref_seqs_rev,
# snp_seq_rev = snp_seqs_rev,
# snpbase=snpbases)
# setkey(snp_tbl, snpid, snpbase)
snp_tbl <- data.frame(snpid = snpids, ref_seq = ref_seqs, snp_seq = snp_seqs, ref_seq_rev = ref_seqs_rev, snp_seq_rev = snp_seqs_rev, snpbase = snpbases, stringsAsFactors=FALSE)
snp_tbl[with(snp_tbl, order(snpid, snpbase)),]
return(list(snp.tbl = snp_tbl, motif.scores = motif.scores))
}
#' @name MatchSubsequence
#' @title Compute the matching subsequence.
#' @description This function combines the SNP set, the motif library and the
#' affinity score table and produce the matching subsequence found at each SNP
#' location for each motif.
#' @param snp.tbl A data.frame with the following information:
#' \tabular{cc}{
#' snpid \tab SNP id.\cr
#' ref_seq \tab Reference allele nucleotide sequence.\cr
#' snp_seq \tab SNP allele nucleotide sequence.\cr
#' ref_seq_rev \tab Reference allele nucleotide sequence on the reverse
#' strand.\cr
#' snp_seq_rev \tab SNP allele nucleotide sequence on the reverse strand.\cr}
#' @param motif.scores A data.frame with the following information:
#' \tabular{cc}{
#' motif \tab Name of the motif.\cr
#' motif_len \tab Length of the motif.\cr
#' ref_start, ref_end, ref_strand \tab Location of the best matching subsequence
#' on the reference allele.\cr
#' snp_start, snp_end, snp_strand \tab Location of the best matching subsequence
#' on the SNP allele.\cr
#' log_lik_ref \tab Log-likelihood score for the reference allele.\cr
#' log_lik_snp \tab Log-likelihood score for the SNP allele.\cr
#' log_lik_ratio \tab The log-likelihood ratio.\cr
#' log_enhance_odds \tab Difference in log-likelihood ratio between SNP allele
#' and reference allele based on the best matching subsequence on the reference
#' allele.\cr
#' log_reduce_odds \tab Difference in log-likelihood ratio between reference
#' allele and SNP allele based on the best matching subsequence on the SNP
#' allele.\cr
#' }
#' @param motif.lib A list of the position weight matrices for the motifs.
#' @param snpids A subset of snpids to compute the subsequences. Default: NULL,
#' when all snps are computed.
#' @param motifs A subset of motifs to compute the subsequences. Default: NULL,
#' when all motifs are computed.
#' @param ncores The number of cores used for parallel computing.
#' @return A data.frame containing all columns in both \code{snp.tbl} and
#' \code{motif.scores}. In addition, the following columns are added:
#' \tabular{ll}{
#' ref_match_seq \tab Best matching subsequence on the reference allele.\cr
#' snp_match_seq \tab Best matching subsequence on the SNP allele.\cr
#' ref_seq_snp_match \tab Subsequence on the reference allele corresponding to
#' the best matching location on the SNP allele.\cr
#' snp_seq_ref_match \tab Subsequence on the SNP allele corresponding to the
#' best matching location on the reference allele.\cr
#' }
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' MatchSubsequence(motif_scores$snp.tbl, motif_scores$motif.scores,
#' motif_library, ncores=2)
#' @useDynLib atSNP
#' @import data.table
#' @importFrom BiocParallel bpmapply MulticoreParam SnowParam
#' @export
MatchSubsequence <- function(snp.tbl, motif.scores, motif.lib, snpids = NULL, motifs = NULL, ncores = 1) {
if(is.null(snpids)) {
snpids <- unique(snp.tbl$snpid)
}
if(is.null(motifs)) {
motifs <- unique(motif.scores$motif)
}
if(sum(! motifs %in% names(motif.lib)) > 0) {
motif.discard <- setdiff(motifs, unique(names(motif.lib)))
message("Warning: the following motifs are not included in 'motif.lib' and are discarded: ")
message(paste(motif.discard, collapse = ", "))
motifs <- setdiff(motifs, motif.discard)
}
if(sum(! snpids %in% motif.scores$snpid) > 0) {
snp.discard <- setdiff(snpids, unique(motif.scores$snpid))
message("Warning: the following snpids are not included in 'motif.scores' and are discarded: ")
message(paste(snp.discard, collapse = ", "))
snpids <- setdiff(snpids, snp.discard)
}
snpids <- unique(snpids)
motifs <- unique(motifs)
motif.scores_dt<-as.data.table(motif.scores)
setkey(motif.scores_dt, motif, snpid, snpbase)
snp.tbl <- as.data.table(snp.tbl)
setkey(snp.tbl, snpid, snpbase)
motif.scores <- motif.scores_dt[snpid %in% snpids & motif %in% motifs, ]
snp.tbl <- snp.tbl[snpid %in% snpids, ]
snp.tbl[, len_seq := nchar(ref_seq)]
## get the IUPAC subsequence for the motifs
motif.tbl <- data.table(
motif = motifs,
IUPAC = sapply(motif.lib[motifs],
function(x) GetIUPACSequence(x, prob = 0.25))
)
setkey(motif.tbl, motif)
setkey(snp.tbl, snpid, snpbase)
ncores <- min(c(ncores, length(snpids)))
k <- as.integer(length(snpids) / ncores)
if(ncores > 1) {
if(Sys.info()[["sysname"]] == "Windows"){
snow <- SnowParam(workers = ncores, type = "SOCK")
motif_score_par_list<-bpmapply(function(x) match_subseq_par(i=x, par.k=k, par.ncores=ncores, par.snp.tbl=snp.tbl, par.snpids=snpids, par.motif.scores=motif.scores, par.motif=motif, par.motif.tbl=motif.tbl), seq(ncores), BPPARAM = snow,SIMPLIFY = FALSE)
}else{
motif_score_par_list<-bpmapply(function(x) match_subseq_par(i=x, par.k=k, par.ncores=ncores, par.snp.tbl=snp.tbl, par.snpids=snpids, par.motif.scores=motif.scores, par.motif=motif, par.motif.tbl=motif.tbl), seq(ncores), BPPARAM = MulticoreParam(workers = ncores),
SIMPLIFY = FALSE)
}
} else
{
motif_score_par_list<-list(match_subseq_par(1, par.k=k, par.ncores=ncores, par.snp.tbl=snp.tbl,
par.snpids=snpids, par.motif.scores=motif.scores,
par.motif=motif, par.motif.tbl=motif.tbl))
}
motif_score_tbl <- motif_score_par_list[[1]]
if(ncores > 1) {
for(i in 2:ncores) {
motif_score_tbl <- rbind(motif_score_tbl,
motif_score_par_list[[i]])
}
}
motif_score_tbl<-as.data.frame(motif_score_tbl, stringAsFacotrs=FALSE)
return(motif_score_tbl)
}
#' @name ComputePValues
#' @title Compute p-values for affinity scores.
#' @description This function computes the p-values for allele-specific affinity
#' scores and between-allele affinity score changes using the importance
#' sampling technique.
#' @param motif.lib A list object with the output format of function
#' \code{\link{LoadMotifLibrary}}.
#' @param snp.info A list object with the output format of function
#' \code{\link{LoadSNPData}}.
#' @param motif.scores A data.frame object containing at least the following
#' columns:
#' \tabular{ll}{
#' motif \tab The name of the motif.\cr
#' log_lik_ref \tab The log-likelihood score for the reference allele.\cr
#' log_lik_snp \tab The log-likelihood score for the SNP allele.\cr
#' }
#' @param ncores An integer for the number of parallel process. Default: 1.
#' @param testing.mc Monte Carlo sample size of 200 is considered. Do not
#' change the default unless conducting a quick test. Default: FALSE
#' @param figdir A string for the path to print p-value plots for monitoring
#' results. Default: NULL (no figure).
#' @return A data.frame extending \code{motif.scores} by the following
#' additional columns:
#' \tabular{ll}{
#' pval_ref \tab P-values for scores on the reference allele.\cr
#' pval_snp \tab P-values for scores on the SNP allele.\cr
#' pval_cond_ref \tab Conditional p-values for scores on the reference
#' allele.\cr
#' pval_cond_snp \tab Conditional p-values for scores on the SNP allele.\cr
#' pval_diff \tab P-values for the difference in scores between the reference
#' and the SNP alleles.\cr
#' pval_rank \tab P-values for the log rank ratio between the reference and the
#' SNP alleles.\cr
#' }
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' ComputePValues(motif_library, snpInfo, motif_scores$motif.scores, ncores = 2, testing.mc=TRUE)
#' @import Rcpp
#' @import data.table
#' @import ggplot2
#' @importFrom BiocParallel bpmapply MulticoreParam SnowParam
#' @useDynLib atSNP
#' @export
ComputePValues <- function(motif.lib, snp.info, motif.scores, ncores = 1, testing.mc=FALSE, figdir = NULL) {
ncores <- min(c(ncores, length(motif.lib)))
results <- as.list(seq_along(motif.lib))
nsets <- as.integer(length(motif.lib) / ncores)
motif.scores <- as.data.table(motif.scores)
prior <- snp.info$prior
transition <- snp.info$transition
if(Sys.info()[["sysname"]] == "Windows"){
snow <- SnowParam(workers = ncores, type = "SOCK")
results<-bpmapply(function(x) results_motif_par(motif.id=x, par.prior=prior, par.transition=transition, par.motif.lib=motif.lib, par.motif.scores=motif.scores, par.testing.mc=testing.mc, par.figdir=figdir), seq_along(motif.lib), BPPARAM = snow,SIMPLIFY = FALSE)
}else{
results<-bpmapply(function(x) results_motif_par(motif.id=x, par.prior=prior, par.transition=transition, par.motif.lib=motif.lib, par.motif.scores=motif.scores, par.testing.mc=testing.mc, par.figdir=figdir), seq_along(motif.lib), BPPARAM = MulticoreParam(workers = ncores),
SIMPLIFY = FALSE)
}
motif.scores_dt<-as.data.table(motif.scores)
setkey(motif.scores_dt, motif, snpid, snpbase)
for(i in seq_along(results)) {
motif.scores_dt[results[[i]]$rowids, pval_ref := results[[i]]$pval_a[, 1]]
motif.scores_dt[results[[i]]$rowids, pval_snp := results[[i]]$pval_a[, 2]]
motif.scores_dt[results[[i]]$rowids, pval_cond_ref := results[[i]]$pval_cond[, 1]]
motif.scores_dt[results[[i]]$rowids, pval_cond_snp := results[[i]]$pval_cond[, 2]]
motif.scores_dt[results[[i]]$rowids, pval_diff := results[[i]]$pval_diff[, 1]]
motif.scores_dt[results[[i]]$rowids, pval_rank := results[[i]]$pval_rank[, 1]]
}
motif.scores.pval<-as.data.frame(motif.scores_dt, stringsAsFactors=FALSE)
return(motif.scores.pval)
}
#' @name GetIUPACSequence
#' @title Get the IUPAC sequence of a motif.
#' @description Convert the posotion weight matrix of a motif to the IUPAC
#' sequence.
#' @param pwm The position weight matrix, with the columns representing
#' A, C, G, T.
#' @param prob The probability threshold. Default: 0.25.
#' @return A character string.
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' GetIUPACSequence(motif_library[[1]], prob = 0.2)
#' @export
GetIUPACSequence <- function(pwm, prob = 0.25) {
iupac.table <-
c(".", "A", "C", "M", "G", "R", "S", "V", "T", "W", "Y", "H", "K", "D", "B", "N")
iupac.value <- (pwm >= prob) %*% c(1, 2, 4, 8) + 1
return(paste(iupac.table[iupac.value], collapse = ""))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.