inst/doc/BiocFileCache.R

## ----setup, echo=FALSE--------------------------------------------------------
knitr::opts_chunk$set(collapse=TRUE)

## ---- eval = FALSE------------------------------------------------------------
#  if (!"BiocManager" %in% rownames(installed.packages()))
#       install.packages("BiocManager")
#  BiocManager::install("BiocFileCache", dependencies=TRUE)

## ---- results='hide', warning=FALSE, message=FALSE----------------------------
library(BiocFileCache)

## -----------------------------------------------------------------------------
path <- tempfile()
bfc <- BiocFileCache(path, ask = FALSE)

## -----------------------------------------------------------------------------
bfccache(bfc)
length(bfc)

## -----------------------------------------------------------------------------
bfc

## -----------------------------------------------------------------------------
bfcinfo(bfc)

## -----------------------------------------------------------------------------
savepath <- bfcnew(bfc, "NewResource", ext=".RData")
savepath

## now we can use that path in any save function
m = matrix(1:12, nrow=3)
save(m, file=savepath)

## and that file will be tracked in the cache
bfcinfo(bfc)

## -----------------------------------------------------------------------------
fl1 <- tempfile(); file.create(fl1)
add2 <- bfcadd(bfc, "Test_addCopy", fl1)                 # copy
# returns filepath being tracked in cache
add2
# the name is the unique rid in the cache
rid2 <- names(add2)

fl2 <- tempfile(); file.create(fl2)
add3 <- bfcadd(bfc, "Test2_addMove", fl2, action="move") # move
rid3 <- names(add3)

fl3 <- tempfile(); file.create(fl3)
add4 <- bfcadd(bfc, "Test3_addAsis", fl3, rtype="local",
	       action="asis") # reference
rid4 <- names(add4)

file.exists(fl1)    # TRUE - copied from original location
file.exists(fl2)    # FALSE - moved from original location
file.exists(fl3)    # TRUE - left asis, original location tracked

## -----------------------------------------------------------------------------
url <- "http://httpbin.org/get"
add5 <- bfcadd(bfc, "TestWeb", fpath=url)
rid5 <- names(add5)

url2<- "https://en.wikipedia.org/wiki/Bioconductor"
add6 <- bfcadd(bfc, "TestWeb", fpath=url2)
rid6 <- names(add6)

# add a remote resource but don't initially download
add7 <- bfcadd(bfc, "TestNoDweb", fpath=url2, download=FALSE)
rid7 <- names(add7)
# let's look at our BiocFileCache object now
bfc
bfcinfo(bfc)

## -----------------------------------------------------------------------------
bfcquery(bfc, "Web")

bfcquery(bfc, "copy")

q1 <- bfcquery(bfc, "wiki")
q1
class(q1)

## -----------------------------------------------------------------------------
bfccount(q1)

## -----------------------------------------------------------------------------
bfcsubWeb = bfc[paste0("BFC", 5:6)]
bfcsubWeb
bfcinfo(bfcsubWeb)

## -----------------------------------------------------------------------------
bfc[["BFC2"]]
bfcpath(bfc, "BFC2")
bfcpath(bfc, "BFC5")
bfcrpath(bfc, rids="BFC5")
bfcrpath(bfc)
bfcrpath(bfc, c("http://httpbin.org/get","Test3_addAsis"))

## -----------------------------------------------------------------------------
bfcneedsupdate(bfc, "BFC5")
bfcneedsupdate(bfc, "BFC6")
bfcneedsupdate(bfc)

## -----------------------------------------------------------------------------
fileBeingReplaced <- bfc[[rid3]]
fileBeingReplaced

# fl3 was created when we were adding resources
fl3

bfc[[rid3]]<-fl3
bfc[[rid3]]

## -----------------------------------------------------------------------------
bfcinfo(bfc, "BFC1")
bfcupdate(bfc, "BFC1", rname="FirstEntry")
bfcinfo(bfc, "BFC1")

## -----------------------------------------------------------------------------
suppressPackageStartupMessages({
    library(dplyr)
})
bfcinfo(bfc, "BFC6") %>% select(rid, rpath, fpath)
bfcupdate(bfc, "BFC6", fpath=url, rname="Duplicate", ask=FALSE)
bfcinfo(bfc, "BFC6") %>% select(rid, rpath, fpath)

## -----------------------------------------------------------------------------
rid <- "BFC5"
test <- !identical(bfcneedsupdate(bfc, rid), FALSE) # 'TRUE' or 'NA'
if (test)
    bfcdownload(bfc, rid, ask=FALSE)

## -----------------------------------------------------------------------------
names(bfcinfo(bfc))
meta <- as.data.frame(list(rid=bfcrid(bfc)[1:3], idx=1:3))
bfcmeta(bfc, name="resourceData") <- meta
names(bfcinfo(bfc))

## -----------------------------------------------------------------------------
bfcmetalist(bfc)
bfcmeta(bfc, name="resourceData")

## -----------------------------------------------------------------------------
bfcmetaremove(bfc, name="resourceData")

## ----eval=FALSE---------------------------------------------------------------
#  bfcmeta(name="resourceData") <- meta
#  Error in bfcmeta(name = "resourceData") <- meta :
#    target of assignment expands to non-language object

## ----eval=FALSE---------------------------------------------------------------
#  bfc <- BiocFileCache()
#  bfcmeta(bfc, name="resourceData") <- meta

## -----------------------------------------------------------------------------
# let's remind ourselves of our object
bfc

bfcremove(bfc, "BFC6")
bfcremove(bfc, "BFC1")

# let's look at our BiocFileCache object now
bfc

## -----------------------------------------------------------------------------
# create a new entry that hasn't been used
path <- bfcnew(bfc, "UseMe")
rmMe <- names(path)
# We also have a file not being tracked because we updated rpath

bfcsync(bfc)

# you can suppress the messages and just have a TRUE/FALSE
bfcsync(bfc, FALSE)

#
# Let's do some cleaning to have a synced object
#
bfcremove(bfc, rmMe)
unlink(fileBeingReplaced)

bfcsync(bfc)

## ----eval=FALSE---------------------------------------------------------------
#  # export entire biocfilecache
#  exportbfc(bfc)
#  
#  # export the first 4 entries of biocfilecache
#  # as a compressed tar
#  exportbfc(bfc, rids=paste0("BFC", 1:4),
#  	  outputFile="BiocFileCacheExport.tar.gz", compression="gzip")
#  
#  # export the subsetted object of web resources as zip
#  sub1 <- bfc[bfcrid(bfcquery(bfc, "web", field='rtype'))]
#  exportbfc(sub1, outputFile = "BiocFileCacheExportWeb.zip",
#  	  outMethod="zip")

## ----eval=FALSE---------------------------------------------------------------
#  
#  bfc <- importbfc("BiocFileCacheExport.tar")
#  
#  bfc2 <- importbfc("BiocFileCacheExport.tar.gz", compression="gzip")
#  
#  bfc3 <- importbfc("BiocFileCacheExportWeb.zip", archiveMethod="unzip")

## -----------------------------------------------------------------------------
tbl <- data.frame(rtype=c("web","web"),
		      rpath=c(NA_character_,NA_character_),
		  fpath=c("http://httpbin.org/get",
			  "https://en.wikipedia.org/wiki/Bioconductor"),
		      keywords = c("httpbin", "wiki"), stringsAsFactors=FALSE)
tbl

## ----eval=FALSE---------------------------------------------------------------
#  
#  newbfc <- makeBiocFileCacheFromDataFrame(tbl,
#  					 cache=file.path(tempdir(),"BFC"),
#  					 actionWeb="copy",
#  					 actionLocal="copy",
#  					 metadataName="resourceMetadata")
#  

## ----eval=FALSE---------------------------------------------------------------
#  cleanbfc(bfc)

## ----eval=FALSE---------------------------------------------------------------
#  removebfc(bfc)

## -----------------------------------------------------------------------------
## paste to avoid long line in vignette
url <- paste(
    "ftp://ftp.ensembl.org/pub/release-71/gtf",
    "homo_sapiens/Homo_sapiens.GRCh37.71.gtf.gz",
    sep="/")

## ---- eval=FALSE--------------------------------------------------------------
#  library(BiocFileCache)
#  bfc <- BiocFileCache()
#  path <- bfcrpath(bfc, url)

## ---- eval=FALSE--------------------------------------------------------------
#  gtf <- rtracklayer::import.gff(path)

## ---- eval=FALSE--------------------------------------------------------------
#  gtf <- rtracklayer::import.gff(bfcrpath(BiocFileCache(), url))

## ---- eval=FALSE--------------------------------------------------------------
#  library(BiocFileCache)
#  bfc <- BiocFileCache("~/my-experiment/results")

## ---- eval=FALSE--------------------------------------------------------------
#  suppressPackageStartupMessages({
#      library(DESeq2)
#      library(airway)
#  })
#  data(airway)
#  dds <- DESeqDataData(airway, design = ~ cell + dex)
#  result <- DESeq(dds)

## ---- eval=FALSE--------------------------------------------------------------
#  saveRDS(result, bfcnew(bfc, "airway / DESeq standard analysis"))

## ---- eval=FALSE--------------------------------------------------------------
#  result <- readRDS(bfcrpath(bfc, "airway / DESeq standard analysis"))

## ----eval=FALSE---------------------------------------------------------------
#  suppressPackageStartupMessages({
#      library(BiocFileCache)
#      library(rtracklayer)
#  })
#  
#  # load the cache
#  path <- file.path(tempdir(), "tempCacheDir")
#  bfc <- BiocFileCache(path)
#  
#  # the web resource of interest
#  url <- "ftp://ftp.ensembl.org/pub/release-71/gtf/homo_sapiens/Homo_sapiens.GRCh37.71.gtf.gz"
#  
#  # check if url is being tracked
#  res <- bfcquery(bfc, url)
#  
#  if (bfccount(res) == 0L) {
#  
#      # if it is not in cache, add
#      ans <- bfcadd(bfc, rname="ensembl, homo sapien", fpath=url)
#  
#  } else {
#  
#    # if it is in cache, get path to load
#    rid = res %>% filter(fpath == url) %>% collect(Inf) %>% `[[`("rid")
#    ans <- bfcrpath(bfc, rid)
#  
#    # check to see if the resource needs to be updated
#    check <- bfcneedsupdate(bfc, rid)
#    # check can be NA if it cannot be determined, choose how to handle
#    if (is.na(check)) check <- TRUE
#    if (check){
#      ans < - bfcdownload(bfc, rid)
#    }
#  }
#  
#  # ans is the path of the file to load
#  ans
#  
#  # we know because we search for the url that the file is a .gtf.gz,
#  # if we searched on other terms we can use 'bfcpath' to see the
#  # original fpath to know the appropriate load/read/import method
#  bfcpath(bfc, names(ans))
#  
#  temp = GTFFile(ans)
#  info = import(temp)

## ----eval=TRUE----------------------------------------------------------------

#
# A simplier test to see if something is in the cache
# and if not start tracking it is using `bfcrpath`
#

suppressPackageStartupMessages({
    library(BiocFileCache)
    library(rtracklayer)
})

# load the cache
path <- file.path(tempdir(), "tempCacheDir")
bfc <- BiocFileCache(path, ask=FALSE)

# the web resources of interest
url <- "ftp://ftp.ensembl.org/pub/release-71/gtf/homo_sapiens/Homo_sapiens.GRCh37.71.gtf.gz"

url2 <- "ftp://ftp.ensembl.org/pub/release-71/gtf/rattus_norvegicus/Rattus_norvegicus.Rnor_5.0.71.gtf.gz"

# if not in cache will download and create new entry
pathsToLoad <- bfcrpath(bfc, c(url, url2))

pathsToLoad

# now load files as see fit
info = import(GTFFile(pathsToLoad[1]))
class(info)
summary(info)

## ----eval=FALSE---------------------------------------------------------------
#  #
#  # One could also imagine the following:
#  #
#  
#  library(BiocFileCache)
#  
#  # load the cache
#  bfc <- BiocFileCache()
#  
#  #
#  # Do some work!
#  #
#  
#  # add a location in the cache
#  filepath <- bfcnew(bfc, "R workspace")
#  
#  save(list = ls(), file=filepath)
#  
#  # now the R workspace is being tracked in the cache

## ---- eval=FALSE--------------------------------------------------------------
#  .get_cache <-
#      function()
#  {
#      cache <- rappdirs::user_cache_dir(appname="MyNewPackage")
#      BiocFileCache::BiocFileCache(cache)
#  }

## ---- eval=FALSE--------------------------------------------------------------
#  download_data_file <-
#      function( verbose = FALSE )
#  {
#      fileURL <- "http://a_path_to/someremotefile.tsv.gz"
#  
#      bfc <- .get_cache()
#      rid <- bfcquery(bfc, "geneFileV2", "rname")$rid
#      if (!length(rid)) {
#  	 if( verbose )
#  	     message( "Downloading GENE file" )
#  	 rid <- names(bfcadd(bfc, "geneFileV2", fileURL ))
#      }
#      if (!isFALSE(bfcneedsupdate(bfc, rid)))
#  	bfcdownload(bfc, rid)
#  
#      bfcrpath(bfc, rids = rid)
#  }

## -----------------------------------------------------------------------------
url <- "http://bioconductor.org/packages/stats/bioc/BiocFileCache/BiocFileCache_stats.tab"

headFile <-                         # how to process file before caching
    function(from, to)
{
    dat <- readLines(from)
    writeLines(head(dat), to)
    TRUE
}

rid <- bfcquery(bfc, url, "fpath")$rid
if (!length(rid))                   # not in cache, add but do not download
    rid <- names(bfcadd(bfc, url, download = FALSE))

update <- bfcneedsupdate(bfc, rid)  # TRUE if newly added or stale
if (!isFALSE(update))               # download & process
    bfcdownload(bfc, rid, ask = FALSE, FUN = headFile)

rpath <- bfcrpath(bfc, rids=rid)    # path to processed result
readLines(rpath)                    # read processed result

Try the BiocFileCache package in your browser

Any scripts or data that you put into this service are public.

BiocFileCache documentation built on Nov. 8, 2020, 5:06 p.m.