From 49acf32e346d708fb75365ed61f9ddacf40adb50 Mon Sep 17 00:00:00 2001 From: J Wokaty Date: Tue, 14 Jan 2025 11:00:20 -0500 Subject: [PATCH] Make cache structure hierarchical --- R/bedbaser.R | 93 ++++++++++++++++++++++++---------- R/cache.R | 50 +++++++++++++++--- R/utils.R | 80 +++++++++++++++++------------ man/BEDbase.Rd | 19 +++++-- man/getCache-BEDbase-method.Rd | 6 +-- man/setCache-BEDbase-method.Rd | 4 +- tests/testthat/test-bedbaser.R | 4 +- tests/testthat/test-cache.R | 26 ++++++---- tests/testthat/test-utils.R | 18 ++++--- vignettes/bedbaser.Rmd | 25 +++++---- 10 files changed, 220 insertions(+), 105 deletions(-) diff --git a/R/bedbaser.R b/R/bedbaser.R index fdc5e37..cec5ebf 100644 --- a/R/bedbaser.R +++ b/R/bedbaser.R @@ -5,7 +5,7 @@ #' @return BEDbase class instance .BEDbase <- setClass( "BEDbase", - slots = c("cache"), + slots = c("bedfiles", "bedsets"), contains = "Service" ) @@ -15,9 +15,18 @@ #' #' @title An R client for BEDbase #' -#' @description bedbaser exposes the BEDbase API and includes convenience -#' functions for common tasks, such as to import a BED file by `id` into a -#' `GRanges` object and a BEDset by its `id` into a `GRangesList`. +#' @description bedbaser exposes the [BEDbase API](https://api.bedbase.org) +#' and includes convenience functions for common tasks, such as to import a +#' BED file by `id` into a `GRanges` object and a BEDset by its `id` into a +#' `GRangesList`. +#' +#' \code{BEDbase()} creates a cache similar to that of the +#' [Geniml BBClient's cache](https://docs.bedbase.org/geniml): +#' cache_path +#' bedfiles +#' a/f/afile.bed.gz +#' bedsets +#' a/s/aset.txt #' #' The convenience functions are as follows #' * `bedbaser::BEDbase()`: API service constructor @@ -47,10 +56,18 @@ BEDbase <- function(cache_path, quietly = FALSE) { if (missing(cache_path)) { cache_path <- tools::R_user_dir("bedbaser", which = "cache") + } else if (!dir.exists(cache_path)) { + dir.create(file.path(cache_path, "bedfiles"), recursive = TRUE) + dir.create(file.path(cache_path, "bedsets"), recursive = TRUE) } bedbase <- suppressWarnings( .BEDbase( - cache = BiocFileCache::BiocFileCache(cache_path), + bedfiles = BiocFileCache::BiocFileCache( + file.path(cache_path, "bedfiles") + ), + bedsets = BiocFileCache::BiocFileCache( + file.path(cache_path, "bedsets") + ), AnVIL::Service( service = "bedbase", host = "api.bedbase.org", @@ -63,7 +80,8 @@ BEDbase <- function(cache_path, quietly = FALSE) { ) ) info <- httr::content( - bedbase$list_beds_v1_bed_list_get(limit = 0, offset = 0)) + bedbase$list_beds_v1_bed_list_get(limit = 0, offset = 0) + ) if (!quietly) { message(info$count, " BED files available.") } @@ -73,30 +91,35 @@ BEDbase <- function(cache_path, quietly = FALSE) { #' @rdname BEDbase #' #' @param x BEDbase(1) object -#' @param quietly logical(1) (default \code{TRUE}) display messages +#' @param type character(1) bedfiles or bedsets #' #' @export -setGeneric("getCache", function(x, quietly = TRUE) standardGeneric("getCache")) +setGeneric( + "getCache", + function(x, cache_type = c("bedfiles", "bedsets")) standardGeneric("getCache") +) #' Return cache path #' #' @param x BEDbase(1) object -#' @param quietly logical(1) (default \code{TRUE}) display messages +#' @param cache_type character(1) bedfiles or bedsets #' #' @return BiocFileCache(1) object of BED files #' #' @examples #' bedbase <- BEDbase(tempdir()) -#' getCache(bedbase) +#' getCache(bedbase, "bedfiles") #' #' @export setMethod( "getCache", "BEDbase", - function(x, quietly = TRUE) { - if (quietly) { - BiocFileCache::bfcinfo(x@cache) + function(x, cache_type = c("bedfiles", "bedsets")) { + cache_type <- match.arg(cache_type) + if (cache_type == "bedsets") { + x@bedsets + } else { + x@bedfiles } - x@cache } ) @@ -112,7 +135,7 @@ setGeneric( function(x, cache_path, quietly = TRUE) standardGeneric("setCache") ) -#' Set cache path +#' Set cache along path #' #' @param x BEDbase(1) object #' @param cache_path character(1) @@ -128,10 +151,14 @@ setGeneric( setMethod( "setCache", "BEDbase", function(x, cache_path, quietly = TRUE) { - x@cache <- BiocFileCache::BiocFileCache(cache_path) - if (quietly) { - BiocFileCache::bfcinfo(x@cache) - } + x@bedfiles <- BiocFileCache::BiocFileCache( + file.path(cache_path, "bedfiles"), + !quietly + ) + x@bedsets <- BiocFileCache::BiocFileCache( + file.path(cache_path, "bedsets"), + !quietly + ) x } ) @@ -285,8 +312,9 @@ bb_metadata <- function(bedbase, id, full = FALSE) { #' bb_list_beds(bedbase) #' #' @export -bb_list_beds <- function(bedbase, genome = NULL, bed_type = NULL, limit = 1000, - offset = 0) { +bb_list_beds <- function( + bedbase, genome = NULL, bed_type = NULL, limit = 1000, + offset = 0) { rsp <- bedbase$list_beds_v1_bed_list_get( genome = genome, bed_type = bed_type, limit = limit, offset = offset @@ -422,12 +450,22 @@ bb_bed_text_search <- function(bedbase, query, limit = 10, offset = 0) { #' bb_to_granges(bedbase, ex_bed$id) #' #' @export -bb_to_granges <- function(bedbase, bed_id, file_type = "bed", extra_cols = NULL, - quietly = TRUE) { +bb_to_granges <- function( + bedbase, bed_id, file_type = "bed", extra_cols = NULL, + quietly = TRUE) { stopifnot(file_type %in% c("bed", "bigbed")) metadata <- bb_metadata(bedbase, bed_id, TRUE) - file_path <- .get_file(metadata, getCache(bedbase), file_type, "http", - quietly) + file_path <- .get_file( + metadata, getCache(bedbase, "bedfiles"), file_type, + "http", quietly + ) + + bed_file <- tryCatch( + R.utils::gunzip(file_path, remove = FALSE), + error = function(e) { + gsub(".gz", "", file_path) + } + ) if (file_type == "bed") { .bed_file_to_granges(file_path, metadata, extra_cols, quietly) @@ -489,8 +527,9 @@ bb_to_grangeslist <- function(bedbase, bedset_id, quietly = TRUE) { #' bb_save(bedbase, ex_bed$id, tempdir()) #' #' @export -bb_save <- function(bedbase, bed_or_bedset_id, path, file_type = "bed", - access_type = "http", quietly = TRUE) { +bb_save <- function( + bedbase, bed_or_bedset_id, path, file_type = "bed", + access_type = "http", quietly = TRUE) { if (!dir.exists(path)) { rlang::abort(paste(path, "doesn't exist.", sep = " ")) } diff --git a/R/cache.R b/R/cache.R index a9067b7..a5ea7a4 100644 --- a/R/cache.R +++ b/R/cache.R @@ -1,9 +1,42 @@ +#' Create nested path +#' +#' @description Create directory structure following the BEDbase url structure +#' by creating nested directories in the cache. +#' +#' @param file_name character(1) BEDbase file name +#' @param bfc BiocFileCache(1) object +#' +#' @return character(1) file path +#' +#' @examples +#' bedbase_url <- paste0( +#' "https://data2.bedbase.org/files/2/6/", +#' "26a57da7c732a8e63a1dda7ea18af021.bed.gz" +#' ) +#' .create_nested_path(bedbase_url, bfc) +#' # [1] "2/6/26a57da7c732a8e63a1dda7ea18af021.bed.gz" +#' +#' @noRd +.create_nested_path <- function(bedbase_url, bfc) { + file_name <- .get_file_name(bedbase_url) + bfc_path <- BiocFileCache::bfccache(bfc) + nested_path <- file.path(bfc_path, substr(file_name, 1, 1)) + if (!dir.exists(nested_path)) { + dir.create(nested_path) + } + nested_path <- file.path(nested_path, substr(file_name, 2, 2)) + if (!dir.exists(nested_path)) { + dir.create(nested_path) + } + file.path(nested_path, file_name) +} + #' Retrieve path from cache or download file and cache #' #' This function is described in the BiocFileCache vignette. #' #' @param id character(1) BEDbase id -#' @param url character(1) remote resource +#' @param bedbase_url character(1) remote resource #' @param bfc BiocFileCache(1) object #' @param quietly logical(1) (default \code{TRUE}) display message #' @@ -11,22 +44,25 @@ #' #' @examples #' id <- "233479aab145cffe46221475d5af5fae" -#' url <- paste0( +#' bedbase_url <- paste0( #' "https://data2.bedbase.org/files/2/6/", #' "26a57da7c732a8e63a1dda7ea18af021.bed.gz" #' ) -#' .download_to_cache(id, url, BiocFileCache::BiocFileCache(tempdir())) +#' .download_to_cache(id, bedbase_url, BiocFileCache::BiocFileCache(tempdir())) #' #' @noRd -.download_to_cache <- function(id, url, bfc, quietly = TRUE) { +.download_to_cache <- function(id, bedbase_url, bfc, quietly = TRUE) { rid <- BiocFileCache::bfcquery(bfc, id, "rname")$rid if (!length(rid)) { if (!quietly) { - rlang::inform(paste("Downloading", url, "...")) + rlang::inform(paste("Downloading", bedbase_url, "...")) } + rpath <- .create_nested_path(bedbase_url, bfc) + download.file(bedbase_url, rpath, quiet = quietly) rid <- names(BiocFileCache::bfcadd(bfc, - rname = id, fpath = url, rtype = "web", - download = TRUE, verbose = !quietly + rname = id, + fpath = rpath, rtype = "local", rpath = rpath, + download = FALSE, action = "asis", verbose = !quietly )) } BiocFileCache::bfcrpath(bfc, rids = rid) diff --git a/R/utils.R b/R/utils.R index a140c99..6b8d419 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,25 +1,53 @@ -#' Format BED file metadata +#' Get file name from URL for a file +#' +#' @param a_url character(1) URL +#' +#' @return character(1) file name +#' +#' @examples +#' url <- "https://this/is/an/example" +#' +#' @noRd +.get_file_name <- function(a_url) { + url_parts <- unlist(strsplit(a_url, "/")) + url_parts[length(url_parts)] +} + +#' Get BEDbase url for BED file #' #' @param records list(1) metadata +#' @param file_type character(1) bed or bigbed +#' @param access_type character(1) s3 or http #' -#' @return tibble(1) file metadata +#' @return url to BED file #' #' @examples #' bedbase <- BEDbase() #' ex_bed <- bb_example(bedbase, "bed") #' ex_metadata <- bb_metadata(bedbase, ex_bed$id, TRUE) -#' .format_metadata_files(ex_bed$files) +#' .get_url(ex_bed$files, "bed", "http") #' #' @noRd -.format_metadata_files <- function(metadata) { - dplyr::bind_rows(metadata) |> +.get_url <- function( + metadata, file_type = c("bed", "bigbed"), + access_type = c("s3", "http")) { + file_type <- match.arg(file_type) + access_type <- match.arg(access_type) + file_details <- dplyr::bind_rows(metadata$files) |> tidyr::unnest_wider(access_methods) |> - tidyr::unnest_wider(access_url) + tidyr::unnest_wider(access_url) |> + dplyr::filter( + name == paste(file_type, "file", sep = "_"), + access_id == access_type + ) + file_details$url } -#' Save a file from BEDbase to the cache or a path +#' Get a BED file #' -#' Will create directories that do not exist when saving +#' @description Download or retrieve the file the cache. If not available, get +#' the file from bedbase.org and save to the cache or a path. If a directory +#' does not exist along specified path, it will raise an error message. #' #' @param metadata list(1) full metadata #' @param cache_or_path BiocFileCache(1) or character(1) cache or save path @@ -36,33 +64,20 @@ #' .get_file(md, tempdir(), "bed", "http") #' #' @noRd -.get_file <- function(metadata, cache_or_path, file_type = c("bed", "bigbed"), - access_type = c("s3", "http"), quietly = TRUE) { - file_details <- .format_metadata_files(metadata$files) |> - dplyr::filter( - name == paste(file_type, "file", sep = "_"), - access_id == access_type - ) +.get_file <- function( + metadata, cache_or_path, file_type, access_type, + quietly = TRUE) { + file_url <- .get_url(metadata, file_type, access_type) if (methods::is(cache_or_path, "BiocFileCache")) { - cached_file <- .download_to_cache( - metadata$id, file_details$url, + bed_file <- .download_to_cache( + metadata$id, file_url, cache_or_path, quietly ) - bedbase_file <- tryCatch( - R.utils::gunzip(cached_file, remove = FALSE), - error = function(e) { - gsub(".gz", "", cached_file) - } - ) } else { - if (!dir.exists(cache_or_path)) { - dir.create(cache_or_path, recursive = TRUE) - } - url_parts <- unlist(strsplit(file_details$url, "/")) - bedbase_file <- file.path(cache_or_path, url_parts[length(url_parts)]) - utils::download.file(file_details$url, bedbase_file, quiet = quietly) + bed_file <- file.path(cache_or_path, .get_file_name(file_url)) + utils::download.file(file_url, bed_file, quiet = quietly) } - bedbase_file + bed_file } #' Get extra_cols @@ -152,8 +167,9 @@ #' .bed_file_to_granges(file_path, md) #' #' @noRd -.bed_file_to_granges <- function(file_path, metadata, extra_cols = NULL, - quietly = TRUE) { +.bed_file_to_granges <- function( + file_path, metadata, extra_cols = NULL, + quietly = TRUE) { args <- list(con = file_path) args["format"] <- gsub("peak", "Peak", metadata$bed_format) nums <- stringr::str_replace(metadata$bed_type, "bed", "") |> diff --git a/man/BEDbase.Rd b/man/BEDbase.Rd index 4a7f344..6ba3533 100644 --- a/man/BEDbase.Rd +++ b/man/BEDbase.Rd @@ -8,7 +8,7 @@ \usage{ BEDbase(cache_path, quietly = FALSE) -getCache(x, quietly = TRUE) +getCache(x, cache_type = c("bedfiles", "bedsets")) setCache(x, cache_path, quietly = TRUE) } @@ -18,14 +18,25 @@ setCache(x, cache_path, quietly = TRUE) \item{quietly}{logical(1) (default \code{TRUE}) display messages} \item{x}{BEDbase(1) object} + +\item{type}{character(1) bedfiles or bedsets} } \value{ BEDbase object } \description{ -bedbaser exposes the BEDbase API and includes convenience -functions for common tasks, such as to import a BED file by \code{id} into a -\code{GRanges} object and a BEDset by its \code{id} into a \code{GRangesList}. +bedbaser exposes the \href{https://api.bedbase.org}{BEDbase API} +and includes convenience functions for common tasks, such as to import a +BED file by \code{id} into a \code{GRanges} object and a BEDset by its \code{id} into a +\code{GRangesList}. + +\code{BEDbase()} creates a cache similar to that of the +\href{https://docs.bedbase.org/geniml}{Geniml BBClient's cache}: +cache_path +bedfiles +a/f/afile.bed.gz +bedsets +a/s/aset.txt The convenience functions are as follows \itemize{ diff --git a/man/getCache-BEDbase-method.Rd b/man/getCache-BEDbase-method.Rd index 633c442..eb27cbf 100644 --- a/man/getCache-BEDbase-method.Rd +++ b/man/getCache-BEDbase-method.Rd @@ -4,12 +4,12 @@ \alias{getCache,BEDbase-method} \title{Return cache path} \usage{ -\S4method{getCache}{BEDbase}(x, quietly = TRUE) +\S4method{getCache}{BEDbase}(x, cache_type = c("bedfiles", "bedsets")) } \arguments{ \item{x}{BEDbase(1) object} -\item{quietly}{logical(1) (default \code{TRUE}) display messages} +\item{cache_type}{character(1) bedfiles or bedsets} } \value{ BiocFileCache(1) object of BED files @@ -19,6 +19,6 @@ Return cache path } \examples{ bedbase <- BEDbase(tempdir()) -getCache(bedbase) +getCache(bedbase, "bedfiles") } diff --git a/man/setCache-BEDbase-method.Rd b/man/setCache-BEDbase-method.Rd index 2c73488..107d040 100644 --- a/man/setCache-BEDbase-method.Rd +++ b/man/setCache-BEDbase-method.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/bedbaser.R \name{setCache,BEDbase-method} \alias{setCache,BEDbase-method} -\title{Set cache path} +\title{Set cache along path} \usage{ \S4method{setCache}{BEDbase}(x, cache_path, quietly = TRUE) } @@ -17,7 +17,7 @@ BiocFileCache(1) object of BED files } \description{ -Set cache path +Set cache along path } \examples{ bedbase <- BEDbase(tempdir()) diff --git a/tests/testthat/test-bedbaser.R b/tests/testthat/test-bedbaser.R index 6ce448e..7bf6537 100644 --- a/tests/testthat/test-bedbaser.R +++ b/tests/testthat/test-bedbaser.R @@ -1,9 +1,9 @@ test_that("setCache changes cache", { bedbase <- BEDbase(quietly = TRUE) path <- tempdir() - expect_true(BiocFileCache::bfccache(getCache(bedbase)) != path) + expect_true(BiocFileCache::bfccache(getCache(bedbase, "bedfiles")) != file.path(path, "bedfiles")) bedbase <- setCache(bedbase, path) - expect_true(BiocFileCache::bfccache(getCache(bedbase)) == path) + expect_true(BiocFileCache::bfccache(getCache(bedbase, "bedfiles")) == file.path(path, "bedfiles")) }) test_that("bb_example has bed_format of 'bed' given rec_type 'bed'", { diff --git a/tests/testthat/test-cache.R b/tests/testthat/test-cache.R index d5fd364..759059b 100644 --- a/tests/testthat/test-cache.R +++ b/tests/testthat/test-cache.R @@ -1,23 +1,31 @@ +test_that("nested path is created", { + bedbase <- BEDbase(quietly = TRUE) + bfc <- getCache(bedbase, "bedfiles") + bfc_path <- BiocFileCache::bfccache(bfc) + bedbase_url <- "http://an_example_url/2468.bed.gz" + expect_equal( + .create_nested_path(bedbase_url, bfc), + file.path(bfc_path, "2/4/2468.bed.gz") + ) +}) + test_that("default cache location is used", { path <- tools::R_user_dir("bedbaser", which = "cache") bedbase <- BEDbase(quietly = TRUE) id <- "bbad85f21962bb8d972444f7f9a3a932" gro <- bb_to_granges(bedbase, id, "bed") - bfc <- BiocFileCache::BiocFileCache(path) - expect_equal(BiocFileCache::bfccache(bfc), path) + bfc <- BiocFileCache::BiocFileCache(file.path(path, "bedfiles")) + expect_equal(BiocFileCache::bfccache(bfc), file.path(path, "bedfiles")) md <- bb_metadata(bedbase, id, TRUE) - file_path <- .get_file(md, getCache(bedbase), "bed", "http") - expect_true(paste0(file_path, ".gz") %in% BiocFileCache::bfcinfo(bfc)$rpath) + file_path <- .get_file(md, getCache(bedbase, "bedfiles"), "bed", "http") + expect_true(file_path %in% BiocFileCache::bfcinfo(bfc)$rpath) }) test_that("path is used if set when calling constructor", { path <- tempdir() bedbase <- BEDbase(path, TRUE) - bfc <- getCache(bedbase) + bfc <- getCache(bedbase, "bedfiles") id <- "bbad85f21962bb8d972444f7f9a3a932" gro <- bb_to_granges(bedbase, id, "bed") - expect_equal(BiocFileCache::bfccache(bfc), path) - md <- bb_metadata(bedbase, id, TRUE) - file_path <- .get_file(md, getCache(bedbase), "bed", "http") - expect_true(paste0(file_path, ".gz") %in% BiocFileCache::bfcinfo(bfc)$rpath) + expect_equal(BiocFileCache::bfccache(bfc), file.path(path, "bedfiles")) }) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 1b6b626..c20dedd 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -1,10 +1,16 @@ -test_that(".format_metadata_files returns a tibble with a url column", { +test_that(".get_file_name returns file name", { + expect_equal( + .get_file_name("https://this/is/an/example"), + "example" + ) +}) + +test_that(".get_url returns a url", { bedbase <- BEDbase(quietly = TRUE) ex_bed <- bb_example(bedbase, "bed") ex_md <- bb_metadata(bedbase, ex_bed$id, TRUE) - mdf <- .format_metadata_files(ex_md$files) - expect_true(methods::is((mdf)[1], "tbl_df")) - expect_true("url" %in% names(mdf)) + file_url <- .get_url(ex_md, "bed", "http") + expect_true(stringr::str_detect(file_url, "(https?|ftp|s3)://")) }) test_that(".get_file returns a valid file path", { @@ -14,7 +20,7 @@ test_that(".get_file returns a valid file path", { temp_path <- tempdir() file_path <- .get_file(md, temp_path, "bed", "http") expect_true(file.exists(file_path)) - file_path <- .get_file(md, getCache(bedbase), "bed", "http") + file_path <- .get_file(md, getCache(bedbase, "bedfiles"), "bed", "http") expect_true(file.exists(file_path)) }) @@ -22,7 +28,7 @@ test_that(".get_extra_cols returns a named vector", { bedbase <- BEDbase(quietly = TRUE) id <- "608827efc82fcaa4b0bfc65f590ffef8" md <- bb_metadata(bedbase, id, TRUE) - file_path <- .get_file(md, getCache(bedbase), "bed", "http") + file_path <- .get_file(md, getCache(bedbase, "bedfiles"), "bed", "http") extra_cols <- .get_extra_cols(file_path, 3, 9) expect_equal(9, length(extra_cols)) }) diff --git a/vignettes/bedbaser.Rmd b/vignettes/bedbaser.Rmd index d533032..6266ecd 100644 --- a/vignettes/bedbaser.Rmd +++ b/vignettes/bedbaser.Rmd @@ -36,25 +36,25 @@ if (!"BiocManager" %in% rownames(installed.packages())) { BiocManager::install("bedbaser") ``` -Load the package and create a BEDbase instance. +Load the package and create a BEDbase instance, optionally setting the cache +to `cache_path`. If `cache_path` is not set, bedbaser will choose the +default location. ```{r bedbase} library(bedbaser) bedbase <- BEDbase() ``` +bedbaser can use the same cache as [geniml](https://docs.bedbase.org/geniml/)'s +BBClient by setting the `cache_path` to the same location. It will create +the following structure: -## (Optional) Set the cache - -Set the cache path with the argument `cache_path`. If `cache_path` is not set, -bedbaser will choose the default location. bedbaser can use the same cache as -`bbclient` available through the genomic interval machine learning toolkit -`geniml` by setting the `cache_path` to the same location. - -```{r set_cache, eval = FALSE} -library(bedbaser) - -bedbase <- BEDbase(cache_path = "/path/to/cache") +``` +cache_path + bedfiles + a/f/afile.bed.gz + bedsets + a/s/aset.txt ``` # Convenience Functions @@ -109,7 +109,6 @@ downloads and imports a BED file using `r Biocpkg("rtracklayer")`. ```{r bb_to_granges} ex_bed <- bb_example(bedbase, "bed") -head(ex_bed) # Allow bedbaser to assign column names and types bb_to_granges(bedbase, ex_bed$id, quietly = FALSE) ```