R/get_chain_file.R

Defines functions .get_ucsc_chain_remote .get_ensembl_chain_remote get_chain_file

Documented in get_chain_file

#' Download chain file for liftover
#'
#' @source \href{https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/}{
#' UCSC chain files}
#' @source \href{https://ftp.ensembl.org/pub/assembly_mapping/homo_sapiens/}{
#' Ensembl chain files}
#' @param from genome build converted from ("hg38", "hg19")
#' @param to genome build converted to ("hg19", "hg38")
#' @param chain_source chain file source used ("ucsc" as default, or "ensembl")
#' @param save_dir where is the chain file saved? Default is a temp directory
#' @param verbose extra messages printed? Default is TRUE
#' @return loaded chain file for liftover
#' @keywords internal
#' @importFrom utils download.file
#' @importFrom R.utils gunzip copyFile
#' @importFrom rtracklayer import.chain
get_chain_file <- function(from = c("hg38", "hg19"),
                           to = c("hg19", "hg38"),
                           chain_source = c("ucsc", "ensembl"),
                           save_dir = tempdir(),
                           verbose = TRUE) {
  #### Match args ####
  from <- match.arg(from)
  to <- match.arg(to)
  chain_source <- match.arg(chain_source)
  if(from == to){
    stop("Cannot get a chain file from one reference to the same.")
  }
  if(chain_source == "ucsc"){
    message("Note that you are fetching the UCSC chain file, ",
            "which requires a licence for commercial use.")
  }
  
  #### Define paths ####
  remote_path = switch(
    chain_source,
    "ensembl" = .get_ensembl_chain_remote(from, to),
    "ucsc" = .get_ucsc_chain_remote(from, to)
  )
  local_path <- file.path(save_dir, basename(remote_path))
  local_path_gunzip <- gsub(".gz", "", local_path)
  ### Download chain file ####
  if(file.exists(local_path_gunzip)){
    if(verbose)
      messager(sprintf("Using existing chain file from %s.", 
                       chain_source),v=verbose)
  }
  else{
    source_readable = ifelse(chain_source == "ensembl", "Ensembl", 
                             "UCSC Genome Browser")
    if(verbose)
      messager(sprintf("Downloading chain file from %s.", 
                       source_readable),
               v=verbose)
    error_dwnld <-
      tryCatch(utils::download.file(remote_path, local_path),
               error = function(e){e},
               warning = function(w){w}
      )
    #if download failed use file in package
    if(is(error_dwnld,"warning")||is(error_dwnld,"error")||
       ("message" %in% names(error_dwnld) &&
        (grepl("Couldn't connect to server",error_dwnld$message)||
         grepl("Couldn't resolve host name",error_dwnld$message)||
         grepl("cannot open URL",error_dwnld$message)))
    ){
      if(chain_source != "ensembl")
        chain_file <- basename(.get_ucsc_chain_remote(from, to))
      else
        chain_file <- basename(.get_ensembl_chain_remote(from, to))
      #download.file will create an empty file even if download fails
      if(file.exists(local_path)){
        rmvd <- file.remove(local_path)
        msg <- paste0(
          sprintf("Download of chain file from %s ", source_readable),
          "failed, use ensembl chain file (chain_source='ensembl') ",
          "\nfor an offline solution.")
        if(chain_source!='ensembl')
          stop(msg)
        copied <- R.utils::copyFile(
          srcPathname = system.file("extdata",chain_file,
                                    package="MungeSumstats"),
          save_dir)
        msg2 <- paste0(
          sprintf("Download of chain file from %s ", source_readable),
          "failed, using ensembl chain file downloaded on 2022-11-25 ",
          "instead.")
        if(chain_source=='ensembl'){
          messager(msg2)
          #local_path = 
        }  
      } 
    }
    messager(local_path,v=verbose)
    local_path <- R.utils::gunzip(local_path, overwrite=TRUE)
    local_path_gunzip <- local_path
  }
  #### Import ####
  #NOT NECESSARY WITH FIX TO rtracklayer - v1.59.1
  #(https://github.com/lawremi/rtracklayer/issues/23
  # Ensembl format is slightly different to UCSC, rtracklayer can't handle 
  # the spaces rather than tabs. Solution here as per RoelKluin
  # https://github.com/lawremi/rtracklayer/issues/23
  #if(chain_source == "ensembl"){
  #    new_path = gsub(".chain", "_tabs.chain", local_path_gunzip, fixed=TRUE)
  #    if(!file.exists(new_path)){
  #        system(sprintf(
  #            "sed -E 's/^([0-9]+) ([0-9]+) ([0-9]+)$/\\1\\t\\2\\t\\3/' %s > %s",
  #            local_path_gunzip, new_path)
  #        )
  #    }
  #    local_path_gunzip <- new_path
  #}
  chain <- rtracklayer::import.chain(local_path_gunzip)
  return(chain)
}

.get_ensembl_chain_remote <- function(from = c("hg38", "hg19"),
                                      to = c("hg19", "hg38")) {
  base <- "ftp://ftp.ensembl.org/pub/assembly_mapping/homo_sapiens/"
  ens_convert <- c("hg38" = "GRCh38", "hg19" = "GRCh37")
  ens_from <- ens_convert[from]
  ens_to <- ens_convert[to]
  return(
    paste0(base, ens_from, "_to_", ens_to, ".chain.gz")
  )
}

.get_ucsc_chain_remote <- function(from = c("hg38", "hg19"),
                                   to = c("hg19", "hg38")) {
  base <- "ftp://hgdownload.cse.ucsc.edu/goldenPath/"
  to_caps <- paste0(toupper(substr(to, 1, 1)),substr(to, 2, nchar(to)))
  return(
    paste0(base, from, "/liftOver/", from, "To", to_caps, ".over.chain.gz")
  )
}
neurogenomics/MungeSumstats documentation built on Aug. 10, 2024, 5:59 a.m.