Skip to content

Commit

Permalink
Make cache structure hierarchical
Browse files Browse the repository at this point in the history
  • Loading branch information
jwokaty committed Jan 14, 2025
1 parent 115745a commit 49acf32
Show file tree
Hide file tree
Showing 10 changed files with 220 additions and 105 deletions.
93 changes: 66 additions & 27 deletions R/bedbaser.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#' @return BEDbase class instance
.BEDbase <- setClass(
"BEDbase",
slots = c("cache"),
slots = c("bedfiles", "bedsets"),
contains = "Service"
)

Expand All @@ -15,9 +15,18 @@
#'
#' @title An R client for BEDbase
#'
#' @description bedbaser exposes the BEDbase API and includes convenience
#' functions for common tasks, such as to import a BED file by `id` into a
#' `GRanges` object and a BEDset by its `id` into a `GRangesList`.
#' @description bedbaser exposes the [BEDbase API](https://api.bedbase.org)
#' and includes convenience functions for common tasks, such as to import a
#' BED file by `id` into a `GRanges` object and a BEDset by its `id` into a
#' `GRangesList`.
#'
#' \code{BEDbase()} creates a cache similar to that of the
#' [Geniml BBClient's cache](https://docs.bedbase.org/geniml):
#' cache_path
#' bedfiles
#' a/f/afile.bed.gz
#' bedsets
#' a/s/aset.txt
#'
#' The convenience functions are as follows
#' * `bedbaser::BEDbase()`: API service constructor
Expand Down Expand Up @@ -47,10 +56,18 @@
BEDbase <- function(cache_path, quietly = FALSE) {
if (missing(cache_path)) {
cache_path <- tools::R_user_dir("bedbaser", which = "cache")
} else if (!dir.exists(cache_path)) {
dir.create(file.path(cache_path, "bedfiles"), recursive = TRUE)
dir.create(file.path(cache_path, "bedsets"), recursive = TRUE)
}
bedbase <- suppressWarnings(
.BEDbase(
cache = BiocFileCache::BiocFileCache(cache_path),
bedfiles = BiocFileCache::BiocFileCache(
file.path(cache_path, "bedfiles")
),
bedsets = BiocFileCache::BiocFileCache(
file.path(cache_path, "bedsets")
),
AnVIL::Service(
service = "bedbase",
host = "api.bedbase.org",
Expand All @@ -63,7 +80,8 @@ BEDbase <- function(cache_path, quietly = FALSE) {
)
)
info <- httr::content(
bedbase$list_beds_v1_bed_list_get(limit = 0, offset = 0))
bedbase$list_beds_v1_bed_list_get(limit = 0, offset = 0)
)
if (!quietly) {
message(info$count, " BED files available.")
}
Expand All @@ -73,30 +91,35 @@ BEDbase <- function(cache_path, quietly = FALSE) {
#' @rdname BEDbase
#'
#' @param x BEDbase(1) object
#' @param quietly logical(1) (default \code{TRUE}) display messages
#' @param type character(1) bedfiles or bedsets
#'
#' @export
setGeneric("getCache", function(x, quietly = TRUE) standardGeneric("getCache"))
setGeneric(
"getCache",
function(x, cache_type = c("bedfiles", "bedsets")) standardGeneric("getCache")
)

#' Return cache path
#'
#' @param x BEDbase(1) object
#' @param quietly logical(1) (default \code{TRUE}) display messages
#' @param cache_type character(1) bedfiles or bedsets
#'
#' @return BiocFileCache(1) object of BED files
#'
#' @examples
#' bedbase <- BEDbase(tempdir())
#' getCache(bedbase)
#' getCache(bedbase, "bedfiles")
#'
#' @export
setMethod(
"getCache", "BEDbase",
function(x, quietly = TRUE) {
if (quietly) {
BiocFileCache::bfcinfo(x@cache)
function(x, cache_type = c("bedfiles", "bedsets")) {
cache_type <- match.arg(cache_type)
if (cache_type == "bedsets") {
x@bedsets
} else {
x@bedfiles
}
x@cache
}
)

Expand All @@ -112,7 +135,7 @@ setGeneric(
function(x, cache_path, quietly = TRUE) standardGeneric("setCache")
)

#' Set cache path
#' Set cache along path
#'
#' @param x BEDbase(1) object
#' @param cache_path character(1)
Expand All @@ -128,10 +151,14 @@ setGeneric(
setMethod(
"setCache", "BEDbase",
function(x, cache_path, quietly = TRUE) {
x@cache <- BiocFileCache::BiocFileCache(cache_path)
if (quietly) {
BiocFileCache::bfcinfo(x@cache)
}
x@bedfiles <- BiocFileCache::BiocFileCache(
file.path(cache_path, "bedfiles"),
!quietly
)
x@bedsets <- BiocFileCache::BiocFileCache(
file.path(cache_path, "bedsets"),
!quietly
)
x
}
)
Expand Down Expand Up @@ -285,8 +312,9 @@ bb_metadata <- function(bedbase, id, full = FALSE) {
#' bb_list_beds(bedbase)
#'
#' @export
bb_list_beds <- function(bedbase, genome = NULL, bed_type = NULL, limit = 1000,
offset = 0) {
bb_list_beds <- function(
bedbase, genome = NULL, bed_type = NULL, limit = 1000,
offset = 0) {
rsp <- bedbase$list_beds_v1_bed_list_get(
genome = genome, bed_type = bed_type,
limit = limit, offset = offset
Expand Down Expand Up @@ -422,12 +450,22 @@ bb_bed_text_search <- function(bedbase, query, limit = 10, offset = 0) {
#' bb_to_granges(bedbase, ex_bed$id)
#'
#' @export
bb_to_granges <- function(bedbase, bed_id, file_type = "bed", extra_cols = NULL,
quietly = TRUE) {
bb_to_granges <- function(
bedbase, bed_id, file_type = "bed", extra_cols = NULL,
quietly = TRUE) {
stopifnot(file_type %in% c("bed", "bigbed"))
metadata <- bb_metadata(bedbase, bed_id, TRUE)
file_path <- .get_file(metadata, getCache(bedbase), file_type, "http",
quietly)
file_path <- .get_file(
metadata, getCache(bedbase, "bedfiles"), file_type,
"http", quietly
)

bed_file <- tryCatch(
R.utils::gunzip(file_path, remove = FALSE),
error = function(e) {
gsub(".gz", "", file_path)
}
)

if (file_type == "bed") {
.bed_file_to_granges(file_path, metadata, extra_cols, quietly)
Expand Down Expand Up @@ -489,8 +527,9 @@ bb_to_grangeslist <- function(bedbase, bedset_id, quietly = TRUE) {
#' bb_save(bedbase, ex_bed$id, tempdir())
#'
#' @export
bb_save <- function(bedbase, bed_or_bedset_id, path, file_type = "bed",
access_type = "http", quietly = TRUE) {
bb_save <- function(
bedbase, bed_or_bedset_id, path, file_type = "bed",
access_type = "http", quietly = TRUE) {
if (!dir.exists(path)) {
rlang::abort(paste(path, "doesn't exist.", sep = " "))
}
Expand Down
50 changes: 43 additions & 7 deletions R/cache.R
Original file line number Diff line number Diff line change
@@ -1,32 +1,68 @@
#' Create nested path
#'
#' @description Create directory structure following the BEDbase url structure
#' by creating nested directories in the cache.
#'
#' @param file_name character(1) BEDbase file name
#' @param bfc BiocFileCache(1) object
#'
#' @return character(1) file path
#'
#' @examples
#' bedbase_url <- paste0(
#' "https://data2.bedbase.org/files/2/6/",
#' "26a57da7c732a8e63a1dda7ea18af021.bed.gz"
#' )
#' .create_nested_path(bedbase_url, bfc)
#' # [1] "2/6/26a57da7c732a8e63a1dda7ea18af021.bed.gz"
#'
#' @noRd
.create_nested_path <- function(bedbase_url, bfc) {
file_name <- .get_file_name(bedbase_url)
bfc_path <- BiocFileCache::bfccache(bfc)
nested_path <- file.path(bfc_path, substr(file_name, 1, 1))
if (!dir.exists(nested_path)) {
dir.create(nested_path)
}
nested_path <- file.path(nested_path, substr(file_name, 2, 2))
if (!dir.exists(nested_path)) {
dir.create(nested_path)
}
file.path(nested_path, file_name)
}

#' Retrieve path from cache or download file and cache
#'
#' This function is described in the BiocFileCache vignette.
#'
#' @param id character(1) BEDbase id
#' @param url character(1) remote resource
#' @param bedbase_url character(1) remote resource
#' @param bfc BiocFileCache(1) object
#' @param quietly logical(1) (default \code{TRUE}) display message
#'
#' @return character(1) filepath
#'
#' @examples
#' id <- "233479aab145cffe46221475d5af5fae"
#' url <- paste0(
#' bedbase_url <- paste0(
#' "https://data2.bedbase.org/files/2/6/",
#' "26a57da7c732a8e63a1dda7ea18af021.bed.gz"
#' )
#' .download_to_cache(id, url, BiocFileCache::BiocFileCache(tempdir()))
#' .download_to_cache(id, bedbase_url, BiocFileCache::BiocFileCache(tempdir()))
#'
#' @noRd
.download_to_cache <- function(id, url, bfc, quietly = TRUE) {
.download_to_cache <- function(id, bedbase_url, bfc, quietly = TRUE) {
rid <- BiocFileCache::bfcquery(bfc, id, "rname")$rid
if (!length(rid)) {
if (!quietly) {
rlang::inform(paste("Downloading", url, "..."))
rlang::inform(paste("Downloading", bedbase_url, "..."))
}
rpath <- .create_nested_path(bedbase_url, bfc)
download.file(bedbase_url, rpath, quiet = quietly)
rid <- names(BiocFileCache::bfcadd(bfc,
rname = id, fpath = url, rtype = "web",
download = TRUE, verbose = !quietly
rname = id,
fpath = rpath, rtype = "local", rpath = rpath,
download = FALSE, action = "asis", verbose = !quietly
))
}
BiocFileCache::bfcrpath(bfc, rids = rid)
Expand Down
80 changes: 48 additions & 32 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -1,25 +1,53 @@
#' Format BED file metadata
#' Get file name from URL for a file
#'
#' @param a_url character(1) URL
#'
#' @return character(1) file name
#'
#' @examples
#' url <- "https://this/is/an/example"
#'
#' @noRd
.get_file_name <- function(a_url) {
url_parts <- unlist(strsplit(a_url, "/"))
url_parts[length(url_parts)]
}

#' Get BEDbase url for BED file
#'
#' @param records list(1) metadata
#' @param file_type character(1) bed or bigbed
#' @param access_type character(1) s3 or http
#'
#' @return tibble(1) file metadata
#' @return url to BED file
#'
#' @examples
#' bedbase <- BEDbase()
#' ex_bed <- bb_example(bedbase, "bed")
#' ex_metadata <- bb_metadata(bedbase, ex_bed$id, TRUE)
#' .format_metadata_files(ex_bed$files)
#' .get_url(ex_bed$files, "bed", "http")
#'
#' @noRd
.format_metadata_files <- function(metadata) {
dplyr::bind_rows(metadata) |>
.get_url <- function(
metadata, file_type = c("bed", "bigbed"),
access_type = c("s3", "http")) {
file_type <- match.arg(file_type)
access_type <- match.arg(access_type)
file_details <- dplyr::bind_rows(metadata$files) |>
tidyr::unnest_wider(access_methods) |>
tidyr::unnest_wider(access_url)
tidyr::unnest_wider(access_url) |>
dplyr::filter(
name == paste(file_type, "file", sep = "_"),
access_id == access_type
)
file_details$url
}

#' Save a file from BEDbase to the cache or a path
#' Get a BED file
#'
#' Will create directories that do not exist when saving
#' @description Download or retrieve the file the cache. If not available, get
#' the file from bedbase.org and save to the cache or a path. If a directory
#' does not exist along specified path, it will raise an error message.
#'
#' @param metadata list(1) full metadata
#' @param cache_or_path BiocFileCache(1) or character(1) cache or save path
Expand All @@ -36,33 +64,20 @@
#' .get_file(md, tempdir(), "bed", "http")
#'
#' @noRd
.get_file <- function(metadata, cache_or_path, file_type = c("bed", "bigbed"),
access_type = c("s3", "http"), quietly = TRUE) {
file_details <- .format_metadata_files(metadata$files) |>
dplyr::filter(
name == paste(file_type, "file", sep = "_"),
access_id == access_type
)
.get_file <- function(
metadata, cache_or_path, file_type, access_type,
quietly = TRUE) {
file_url <- .get_url(metadata, file_type, access_type)
if (methods::is(cache_or_path, "BiocFileCache")) {
cached_file <- .download_to_cache(
metadata$id, file_details$url,
bed_file <- .download_to_cache(
metadata$id, file_url,
cache_or_path, quietly
)
bedbase_file <- tryCatch(
R.utils::gunzip(cached_file, remove = FALSE),
error = function(e) {
gsub(".gz", "", cached_file)
}
)
} else {
if (!dir.exists(cache_or_path)) {
dir.create(cache_or_path, recursive = TRUE)
}
url_parts <- unlist(strsplit(file_details$url, "/"))
bedbase_file <- file.path(cache_or_path, url_parts[length(url_parts)])
utils::download.file(file_details$url, bedbase_file, quiet = quietly)
bed_file <- file.path(cache_or_path, .get_file_name(file_url))
utils::download.file(file_url, bed_file, quiet = quietly)
}
bedbase_file
bed_file
}

#' Get extra_cols
Expand Down Expand Up @@ -152,8 +167,9 @@
#' .bed_file_to_granges(file_path, md)
#'
#' @noRd
.bed_file_to_granges <- function(file_path, metadata, extra_cols = NULL,
quietly = TRUE) {
.bed_file_to_granges <- function(
file_path, metadata, extra_cols = NULL,
quietly = TRUE) {
args <- list(con = file_path)
args["format"] <- gsub("peak", "Peak", metadata$bed_format)
nums <- stringr::str_replace(metadata$bed_type, "bed", "") |>
Expand Down
Loading

0 comments on commit 49acf32

Please sign in to comment.