#' Genome build liftover
#'
#' Transfer genomic coordinates from one genome build to another.
#'
#' @source \href{https://doi.org/doi:10.18129/B9.bioc.liftOver}{liftOver}
#' @source \href{https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/}{
#' UCSC chain files}
#' @source \href{https://ftp.ensembl.org/pub/assembly_mapping/homo_sapiens/}{
#' Ensembl chain files}
#'
#' @param sumstats_dt data table obj of the summary statistics
#' file for the GWAS.
#' @param chain_source chain file source used ("ucsc" as default, or "ensembl")
#' @param chrom_col Name of the chromosome column in
#' \code{sumstats_dt} (e.g. "CHR").
#' @param start_col Name of the starting genomic position
#' column in \code{sumstats_dt} (e.g. "POS","start").
#' @param end_col Name of the ending genomic position
#' column in \code{sumstats_dt} (e.g. "POS","end").
#' Can be the same as \code{start_col} when \code{sumstats_dt}
#' only contains SNPs that span 1 base pair (bp) each.
#' @param as_granges Return results as \link[GenomicRanges]{GRanges}
#' instead of a \link[data.table]{data.table} (default: \code{FALSE}).
#' @param style Style to return \link[GenomicRanges]{GRanges} object in
#' (e.g. "NCBI" = 4; "UCSC" = "chr4";) (default: \code{"NCBI"}).
#' @param verbose Print messages.
#' @inheritParams format_sumstats
#'
#' @returns Lifted summary stats in \code{data.table}
#' or \link[GenomicRanges]{GRanges} format.
#'
#' @export
#' @importFrom rtracklayer liftOver width strand end
#' @importFrom GenomeInfoDb seqnames mapGenomeBuilds
#' @importFrom GenomicRanges mcols
#' @importFrom data.table as.data.table setnames :=
#' @examples
#' sumstats_dt <- MungeSumstats::formatted_example()
#'
#' sumstats_dt_hg38 <- liftover(sumstats_dt=sumstats_dt,
#' ref_genome = "hg19",
#' convert_ref_genome="hg38")
liftover <- function(sumstats_dt,
convert_ref_genome,
ref_genome,
chain_source = "ensembl",
imputation_ind = TRUE,
chrom_col = "CHR",
start_col = "BP",
end_col = start_col,
as_granges = FALSE,
style = "NCBI",
local_chain = NULL,
verbose = TRUE) {
IMPUTATION_gen_build <- width <- strand <- end <- seqnames <- NULL;
chain_source <- tolower(chain_source)
#check chain file source
chain_msg <- paste0(
"The chosen chain file source to convert to must be one of ",
"Ensembl or UCSC ('ensembl','ucsc')"
)
if(length(chain_source)>1 || !tolower(chain_source) %in% c("ucsc",
"ensembl")){
stop(chain_msg)
}
#check local chain file
if (!is.null(local_chain) && !file.exists(local_chain)){
lcl_chain_msg <- paste0(
"The local_chain parameter is invalid, please chose a valid path to a ",
"local chain file or leave as NULL to download a chain file."
)
stop(lcl_chain_msg)
}
#Only continue with checks if user specifies a genome to convert to
if (!is.null(convert_ref_genome)){
#### Map genome build synonyms ####
query_ucsc <- if(!is.null(ref_genome)){
GenomeInfoDb::mapGenomeBuilds(genome = ref_genome)$ucscID[1]
} else {ref_genome}
target_ucsc <- if(!is.null(convert_ref_genome)){
GenomeInfoDb::mapGenomeBuilds(genome = convert_ref_genome)$ucscID[1]
} else {convert_ref_genome}
#### Check if one or more of the genomes couldn't be mapped ####
null_builds <- c("query_genome", "target_genome")[
c(is.null(query_ucsc), is.null(target_ucsc))
]
if(length(null_builds)>0){
msg <- paste0("Could not recognize genome build of:\n",
paste(" -",null_builds,collapse = "\n"),
"\nThese will be inferred from the data.")
message(msg)
}
#### Check if liftover is necessary ####
## i.e. the desired genome build isn't the current one
if ((!is.null(query_ucsc) & !is.null(target_ucsc)) &&
(query_ucsc != target_ucsc)) {
msg <- paste0(
"Performing data liftover from ", query_ucsc, " to ",
target_ucsc, "."
)
message(msg)
#### Check that liftover is available ####
## If one or more builds are NULL, this won't be evaluated bc
## the builds will be inferred instead.
if(any(!c(query_ucsc, target_ucsc) %in% c("hg38", "hg19")) ||
query_ucsc == target_ucsc) {
stop("Can only perform liftover between hg19 <---> hg38")
}
#### Convert to GRanges ####
gr <- to_granges(
sumstats_dt = sumstats_dt,
style = "UCSC",
seqnames.field = chrom_col,
start.field = start_col,
end.field = end_col
)
#### Specify chain file ####
if (is.null(local_chain)) {
if (verbose) {message("Downloading chain file...")}
chain <- get_chain_file(
from = query_ucsc,
to = target_ucsc,
chain_source = chain_source,
verbose = verbose
)
}
else {
if (verbose) {message("Using local chain file...")}
#unzip if necessary
filetype = summary(file(local_chain))$class
if(filetype=='gzfile'){
local_chain <- R.utils::gunzip(local_chain, overwrite=TRUE)
}
chain <- rtracklayer::import.chain(local_chain)
}
#### Liftover ####
gr_lifted <- unlist(rtracklayer::liftOver(
x = gr,
chain = chain
))
#### Chrom style ####
gr_lifted <- granges_style(
gr = gr_lifted,
style = style
)
#### Return format ####
if (as_granges) {
if(imputation_ind){
GenomicRanges::mcols(gr_lifted)[["IMPUTATION_gen_build"]] <-
TRUE
}
return(gr_lifted)
} else {
sumstats_dt <- data.table::as.data.table(gr_lifted)
#### rename columns back to original ####
void_cols <- c("width","strand")
void_cols <- void_cols[void_cols %in% names(sumstats_dt)]
if(length(void_cols)>0) sumstats_dt[,(void_cols):=NULL]
data.table::setnames(sumstats_dt,"seqnames","CHR")
#### Remove end_col if it was the same as start_col ####
if (start_col == end_col) {
sumstats_dt[, end := NULL]
data.table::setnames(sumstats_dt, "start", start_col)
} else {
data.table::setnames(sumstats_dt, c("start","end"),
c(start_col, end_col)
)
}
#### lastly rearrange the order again ####
sumstats_dt <- check_col_order(
sumstats_dt = sumstats_dt,
path = NULL
)$sumstats_dt
if (imputation_ind) {
sumstats_dt[, IMPUTATION_gen_build := TRUE]
}
}
}
}
return(sumstats_dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.