#' Read in VCF file
#'
#' Read in a VCF file as a \link[VariantAnnotation]{VCF} or a
#' \link[data.table]{data.table}.
#' Can optionally save the VCF/data.table as well.
#'
#' @param path Path to local or remote VCF file.
#' @param as_datatable Return the data as a
#' \link[data.table]{data.table} (default: \code{TRUE})
#' or a \link[VariantAnnotation]{VCF} (\code{FALSE}).
#' @param samples Which samples to use:
#' \itemize{
#' \item{1 : }{Only the first sample will be used (\emph{DEFAULT}).}
#' \item{NULL : }{All samples will be used.}
#' \item{c("<sample_id1>","<sample_id2>",...) : }{
#' Only user-selected samples will be used (case-insensitive).}
#' }
#' @param which Genomic ranges to be added if supplied. Default is NULL.
#' @param use_params
#' When \code{TRUE} (default), increases the speed of reading in the VCF by
#' omitting columns that are empty based on the head of the VCF (NAs only).
#' NOTE that that this requires the VCF to be sorted, bgzip-compressed,
#' tabix-indexed, which \link[MungeSumstats]{read_vcf} will attempt to do.
#' @param download Download the VCF (and its index file)
#' to a temp folder before reading it into R.
#' This is important to keep \code{TRUE} when \code{nThread>1} to avoid
#' making too many queries to remote file.
#' @param mt_thresh When the number of rows (variants) in the VCF is
#' \code{< mt_thresh}, only use single-threading for reading in the VCF.
#' This is because the overhead of parallelisation outweighs the speed benefits
#' when VCFs are small.
#' @param verbose Print messages.
#' @inheritParams check_empty_cols
#' @inheritParams format_sumstats
#' @inheritParams downloader
#' @inheritParams download_vcf
#' @inheritParams import_sumstats
#' @inheritParams VariantAnnotation::ScanVcfParam
#'
#' @return The VCF file in data.table format.
#' @export
#' @importFrom VariantAnnotation writeVcf
#' @importFrom data.table as.data.table setnames fwrite
#' @source
#' \code{
#' #### Benchmarking ####
#' library(VCFWrenchR)
#' library(VariantAnnotation)
#' path <- "https://gwas.mrcieu.ac.uk/files/ubm-a-2929/ubm-a-2929.vcf.gz"
#' vcf <- VariantAnnotation::readVcf(file = path)
#' N <- 1e5
#' vcf_sub <- vcf[1:N,]
#' res <- microbenchmark::microbenchmark(
#' "vcf2df"={dat1 <- MungeSumstats:::vcf2df(vcf = vcf_sub)},
#' "VCFWrenchR"= {dat2 <- as.data.frame(x = vcf_sub)},
#' "VRanges"={dat3 <- data.table::as.data.table(
#' methods::as(vcf_sub, "VRanges"))},
#' times=1
#' )
#' }
#' @source \href{https://github.com/Bioconductor/VariantAnnotation/issues/57}{
#' Discussion on VariantAnnotation GitHub}
#' @source \href{https://github.com/Bioconductor/VariantAnnotation/issues/59}{
#' Discussion on VariantAnnotation GitHub}
#' @examples
#' #### Local file ####
#' path <- system.file("extdata","ALSvcf.vcf", package="MungeSumstats")
#' sumstats_dt <- read_vcf(path = path)
#'
#' #### Remote file ####
#' ## Small GWAS (0.2Mb)
#' # path <- "https://gwas.mrcieu.ac.uk/files/ieu-a-298/ieu-a-298.vcf.gz"
#' # sumstats_dt2 <- read_vcf(path = path)
#'
#' ## Large GWAS (250Mb)
#' # path <- "https://gwas.mrcieu.ac.uk/files/ubm-a-2929/ubm-a-2929.vcf.gz"
#' # sumstats_dt3 <- read_vcf(path = path, nThread=11)
#'
#' ### Very large GWAS (500Mb)
#' # path <- "https://gwas.mrcieu.ac.uk/files/ieu-a-1124/ieu-a-1124.vcf.gz"
#' # sumstats_dt4 <- read_vcf(path = path, nThread=11)
read_vcf <- function(path,
as_datatable = TRUE,
save_path = NULL,
tabix_index = FALSE,
samples = 1,
which = NULL,
use_params = TRUE,
sampled_rows = 1e4L,
download = TRUE,
vcf_dir = tempdir(),
download_method = "download.file",
force_new = FALSE,
mt_thresh = 1e5L,
nThread = 1,
verbose = TRUE) {
#### Read VCF ####
## Returns either a VCF or a data.table, depending on as_datatable arg.
vcf_dt <- read_vcf_parallel(path = path,
samples = samples,
which = which,
use_params = use_params,
as_datatable = as_datatable,
sampled_rows = sampled_rows,
download = download,
vcf_dir = vcf_dir,
download_method = download_method,
force_new = force_new,
nThread = nThread,
verbose = verbose)
#### Check save path ####
if (!is.null(save_path)) {
check_save_path_out <- check_save_path(
### dummy arg, not used here
log_folder = "NULL",
### dummy arg, not used here
log_folder_ind = FALSE,
save_path = save_path,
write_vcf = !as_datatable,
tabix_index = tabix_index)
save_path <- check_save_path_out$save_path
}
#### Save as VCF ####
if (isFALSE(as_datatable)) {
if (!is.null(save_path)) {
VariantAnnotation::writeVcf(
obj = vcf_dt,
filename = check_save_path_out$save_path,
index = tabix_index
)
}
messager("Returning as VCF.")
return(vcf_dt)
}
#### Prepare SNP column ####
read_vcf_markername(sumstats_dt = vcf_dt)
#### Prepare/un-log P col ####
sumstats_dt <- read_log_pval(sumstats_dt = vcf_dt,
return_list=FALSE)
#### Prepare INFO col ####
read_vcf_info(sumstats_dt = vcf_dt)
#### Rename start col #####
data.table::setnames(vcf_dt, "start", "BP")
#### Write new data ####
if (!is.null(save_path)) {
messager("Storing intermediate file before proceeding ==>", save_path,
v = verbose)
data.table::fwrite(
x = vcf_dt,
file = save_path,
sep = "\t",
nThread = nThread,
na = "NA",
quote = FALSE
)
}
return(vcf_dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.