Nothing
#' @title Annotate GWAS SNPs
#'
#' @description The function annotateSNPsGWAS() annotates GWAS SNPs located
#' in the region flanking the back-spliced junctions of each circRNA.
#' SNPs information including the corresponding genomic coordinates are
#' retrieved from the GWAs catalog database.
#' The user can restric the analysis to specific traits/diseases. These must go
#' in the file traits.txt. If this file is absent or empty, all traits in the
#' GWAS catalog are considered in the analysis.
#' An empty list is returned if none overlapping SNPs are found.
#'
#' @param targets A list containing the target regions to analyze.
#' It can be generated with \code{\link{getSeqsFromGRs}}.
#'
#' @param assembly A string specifying the human genome assembly.
#' Possible options are hg38 or hg19. Current image for GWAS SNPS coordinates
#' is hg38. If hg19 is specified SNPs coordinates are realtime liftOver to
#' hg19 coordinates.
#'
#' @param makeCurrent A logical specifying whether to download the
#' current image of the GWAS catalog. If TRUE is specified, the function
#' \code{\link[gwascat]{makeCurrentGwascat}} from the gwascat package is used
#' to get the more recent image (slow). Default value is TRUE.
#' If FALSE is specified the image data(ebicat37) or data(ebicat38) are used.
#' NOTE: This second option is not available anymore since data(ebicat37) or
#' data(ebicat38) from gwascat are deprecated.
#'
#' @param pathToTraits A string containing the path to the traits.txt file.
#' The file traits.txt contains diseases/traits specified by the user. It must
#' have one column with header id. By default pathToTraits is set to NULL and
#' the file it is searched in the working directory. If traits.txt is located
#' in a different directory then the path needs to be specified. If this file is
#' absent or empty SNPs associated with all diseases/traits in
#' the GWAS catalog are considered in the SNPs analysis.
#'
#' @return A list.
#'
#' @examples
#' # Load data frame containing detected back-spliced junctions
#' data("mergedBSJunctions")
#'
#' # Load short version of the gencode v19 annotation file
#' data("gtf")
#'
#' # Annotate the first back-spliced junctions
#' annotatedBSJs <- annotateBSJs(mergedBSJunctions[1, ], gtf)
#'
#' # Get genome
#' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)){
#' genome <- BSgenome::getBSgenome("BSgenome.Hsapiens.UCSC.hg19")
#'
#' # Retrieve targets
#' targets <- getSeqsFromGRs(
#' annotatedBSJs,
#' genome,
#' lIntron = 200,
#' lExon = 10,
#' type = "ie"
#' )
#'
#' # Annotate GWAS SNPs - slow
#'
#' #snpsGWAS <- annotateSNPsGWAS(targets, makeCurrent = TRUE, assembly = "hg19")
#'
#' }
#'
#'
#' @import dplyr
#' @importFrom gwascat makeCurrentGwascat
#' @importFrom gwascat subsetByTraits
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom GenomicRanges findOverlaps
#' @importFrom magrittr %>%
#' @importFrom utils read.table
#' @importFrom S4Vectors subjectHits
#' @importFrom S4Vectors queryHits
#' @importFrom rlang .data
#' @importFrom GenomeInfoDb seqlevelsStyle
#' @importFrom utils data
#' @export
annotateSNPsGWAS <-
function(targets,
assembly = "hg19",
makeCurrent = TRUE,
pathToTraits = NULL) {
options(readr.num_columns = 0)
# Create a snpsGWAS list
snpsGWAS <- .createSNPsGWASlist(targets)
# Retrieve SNPs from the GWAS catalog
gwas <- .getGWAS(assembly, makeCurrent = TRUE)
# Read traits.txt
traitsFromFile <- .readTraits(pathToTraits)
# Check if there there are traits
if (nrow(traitsFromFile) > 0) {
gwas <- gwascat::subsetByTraits(gwas, tr = traitsFromFile$id)
} else{
cat("traits.txt is empty or absent. All
traits in the GWAS catalog will be analyzed")
}
# Clean targets from NA value
targets <- .cleanTargets(targets)
for (i in seq_along(snpsGWAS)) {
# Create an empty list of 2 elements to store the extracted
# information
snpsGWAS[[i]] <- vector("list", 2)
names(snpsGWAS[[i]])[1] <- "targets"
names(snpsGWAS[[i]])[2] <- "snps"
targetsToAnalyze <- targets[[i]]
overlaps <- .findOverlappingSNPs(gwas, targetsToAnalyze)
snpsGWAS[[i]]$targets <- overlaps$targets
snpsGWAS[[i]]$snps <- overlaps$snps
}
return(snpsGWAS)
}
# Get GWAS SNPs
.getGWAS <- function(assembly = "hg19",
makeCurrent = TRUE) {
if (assembly == "hg19") {
if (makeCurrent) {
gwas <- suppressWarnings(tryCatch(
gwascat::makeCurrentGwascat(fixNonASCII = FALSE, genome = "GRCh37"),
error = function(e)
NULL
))
} else{
gwas <- NULL
}
if (!is.null(gwas)) {
GenomeInfoDb::seqlevelsStyle(gwas) <- "UCSC"
} else{
stop(
"makeCurrentGwascat is unable to establish a connection with EBI/EMBL.
Try later."
)
#Load lifted over image.
# utils::data(ebicat37) # Note: deprecated
# gwas <- ebicat37
# GenomeInfoDb::seqlevelsStyle(gwas) <- "UCSC"
}
} else if (assembly == "hg38") {
if (makeCurrent) {
gwas <- suppressWarnings(tryCatch(
gwascat::makeCurrentGwascat(fixNonASCII = FALSE, genome = "GRCh38"),
error = function(e)
NULL
))
} else{
gwas <- NULL
}
if (!is.null(gwas)) {
GenomeInfoDb::seqlevelsStyle(gwas) <- "UCSC"
} else{
stop(
"makeCurrentGwascat is unable to establish a connection with EBI/EMBL.
Try later."
)
# Load image dated 9 Sept 2020
# utils::data(ebicat38) # Note: deprecated
# gwas <- ebicat38
# GenomeInfoDb::seqlevelsStyle(gwas) <- "UCSC"
}
} else {
stop("Possible genome assembly: hg19, hg38")
}
return(gwas)
}
# Create a snpsGWAS list
.createSNPsGWASlist <- function(targets) {
if (length(targets) == 2 &
names(targets)[[1]] == "upGR") {
# Create an empty list of 2 elements
snpsGWAS <- vector("list", 2)
names(snpsGWAS)[1] <- "upGR"
names(snpsGWAS)[2] <- "downGR"
} else {
stop("target sequences not valid, only upstream and downtream
GRs are allowed.")
}
return(snpsGWAS)
}
# get SNPsGWAS column names
.getSNPsGWASColNames <- function() {
colNames <- c(
"id",
"snp",
"chrom",
"coord",
"mappedGene",
"diseaseTrait",
"pvalue",
"context",
"strongestSNPriskAllele",
"pubmedID",
"study"
)
return(colNames)
}
# Select the needed column and rename snps data frame
.renameSNPsGWAS <- function(snps) {
snps <- snps %>%
dplyr::select(
.data$id,
.data$SNPS,
.data$seqnames.1,
.data$start.1,
.data$MAPPED_GENE,
.data$DISEASE.TRAIT,
.data$P.VALUE,
.data$CONTEXT,
.data$STRONGEST.SNP.RISK.ALLELE,
.data$PUBMEDID,
.data$STUDY
) %>%
dplyr::rename(
snp = .data$SNPS,
chrom = .data$seqnames.1 ,
coord = .data$start.1,
mappedGene = .data$MAPPED_GENE,
diseaseTrait = .data$DISEASE.TRAIT,
pvalue = .data$P.VALUE,
context = .data$CONTEXT,
strongestSNPriskAllele =
.data$STRONGEST.SNP.RISK.ALLELE,
pubmedID = .data$PUBMEDID,
study = .data$STUDY
) %>%
dplyr::arrange(.data$id)
return(snps)
}
# Find the overlapping gwas snps
.findOverlappingSNPs <- function(gwas, targetsToAnalyze) {
# Make GR objects
genRanges <- GenomicRanges::makeGRangesFromDataFrame(
targetsToAnalyze,
keep.extra.columns = TRUE,
ignore.strand = FALSE,
seqinfo = NULL,
seqnames.field = c("chrom"),
start.field = c("startGR"),
end.field = c("endGR"),
strand.field = "strand",
starts.in.df.are.0based = FALSE
)
overlappingGRs <-
suppressWarnings(GenomicRanges::findOverlaps(gwas, genRanges, ignore.strand = TRUE))
if (length(overlappingGRs) == 0) {
# No genomic ranges in common
snps <- data.frame(matrix(nrow = 0, ncol = 11))
colnames(snps) <- .getSNPsGWASColNames()
targets <- targetsToAnalyze[NULL,]
} else{
snps <-
data.frame(genRanges[S4Vectors::subjectHits(overlappingGRs)],
gwas[S4Vectors::queryHits(overlappingGRs)])
snps <- .renameSNPsGWAS(snps)
# Keep only targets where a hit is found
targets <-
targetsToAnalyze[S4Vectors::subjectHits(overlappingGRs),] %>%
dplyr::filter(!duplicated(.)) %>%
dplyr::arrange(.data$id)
}
overlaps <- vector("list", 2)
names(overlaps)[1] <- "snps"
names(overlaps)[2] <- "targets"
overlaps$snps <- snps
overlaps$targets <- targets
return(overlaps)
}
# If the function you are looking for is not here check supportFunction.R
# Functions in supportFunction.R are used by multiple functions.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.